diff --git a/.github/workflows/sanity_check.yml b/.github/workflows/sanity_check.yml index 3ee92fc85c2..27685115584 100644 --- a/.github/workflows/sanity_check.yml +++ b/.github/workflows/sanity_check.yml @@ -32,7 +32,7 @@ jobs: - name: Download clang-format-diff.py uses: wei/wget@v1 with: - args: https://raw.githubusercontent.com/llvm-mirror/clang/master/tools/clang-format/clang-format-diff.py + args: https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py - name: Check format run: VERBOSE_CHECK=1 make check-format diff --git a/.gitignore b/.gitignore index e55b306a5cb..e40ce2c2521 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ rocksdb.pc *.gcda *.gcno *.o +*.o.tmp *.so *.so.* *_test @@ -93,3 +94,8 @@ compile_commands.json .clangd clang-format-diff.py .py3/ + +fuzz/proto/gen/ +fuzz/crash-* + +cmake-build-* diff --git a/.travis.yml b/.travis.yml index 9a0c04afac7..8b4ed913c71 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,21 +3,21 @@ dist: trusty language: cpp os: - linux - - osx arch: - - amd64 - arm64 - ppc64le compiler: - gcc -osx_image: xcode9.4 cache: directories: - "$BUILD_DIR/aws" addons: - apt: - packages: - - zlib1g-dev + apt: + update: true + sources: + - ubuntu-toolchain-r-test + packages: + - libgflags-dev - libbz2-dev - libsnappy-dev - curl diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c5bf804ffe..ec149e9be0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,10 +32,11 @@ # 3. cmake .. # 4. make -j -cmake_minimum_required(VERSION 3.5.1) +cmake_minimum_required(VERSION 3.10) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") include(ReadVersion) +include(GoogleTest) get_rocksdb_version(rocksdb_VERSION) project(rocksdb VERSION ${rocksdb_VERSION} @@ -62,6 +63,7 @@ if(CCACHE_FOUND) endif(CCACHE_FOUND) option(WITH_JEMALLOC "build with JeMalloc" OFF) +option(WITH_LIBURING "build with liburing" ON) option(WITH_SNAPPY "build with SNAPPY" OFF) option(WITH_LZ4 "build with lz4" OFF) option(WITH_ZLIB "build with zlib" OFF) @@ -71,6 +73,12 @@ option(WITH_WINDOWS_UTF8_FILENAMES "use UTF8 as characterset for opening files, if (WITH_WINDOWS_UTF8_FILENAMES) add_definitions(-DROCKSDB_WINDOWS_UTF8_FILENAMES) endif() + +if ($ENV{CIRCLECI}) + message(STATUS "Build for CircieCI env, a few tests may be disabled") + add_definitions(-DCIRCLECI) +endif() + # third-party/folly is only validated to work on Linux and Windows for now. # So only turn it on there by default. 
if(CMAKE_SYSTEM_NAME MATCHES "Linux|Windows") @@ -89,10 +97,9 @@ if( NOT DEFINED CMAKE_CXX_STANDARD ) endif() include(CMakeDependentOption) -CMAKE_DEPENDENT_OPTION(WITH_GFLAGS "build with GFlags" ON - "NOT MSVC;NOT MINGW" OFF) if(MSVC) + option(WITH_GFLAGS "build with GFlags" OFF) option(WITH_XPRESS "build with windows built in compression" OFF) include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc) else() @@ -108,6 +115,11 @@ else() endif() endif() + if(MINGW) + option(WITH_GFLAGS "build with GFlags" OFF) + else() + option(WITH_GFLAGS "build with GFlags" ON) + endif() set(GFLAGS_LIB) if(WITH_GFLAGS) # Config with namespace available since gflags 2.2.2 @@ -119,11 +131,11 @@ else() set(GFLAGS_LIB ${GFLAGS_TARGET}) else() # Config with GFLAGS_LIBRARIES available since gflags 2.1.0 - set(GFLAGS_LIB ${GFLAGS_LIBRARIES}) + set(GFLAGS_LIB ${gflags_LIBRARIES}) endif() else() find_package(gflags REQUIRED) - set(GFLAGS_LIB gflags::gflags) + set(GFLAGS_LIB gflags::gflags) endif() include_directories(${GFLAGS_INCLUDE_DIR}) list(APPEND THIRDPARTY_LIBS ${GFLAGS_LIB}) @@ -178,23 +190,25 @@ if(WITH_AWS) list(APPEND THIRDPARTY_LIBS ${AWSSDK_LINK_LIBRARIES}) endif() -string(TIMESTAMP TS "%Y/%m/%d %H:%M:%S" UTC) -set(GIT_DATE_TIME "${TS}" CACHE STRING "the time we first built rocksdb") +string(TIMESTAMP TS "%Y-%m-%d %H:%M:%S" UTC) +set(BUILD_DATE "${TS}" CACHE STRING "the time we first built rocksdb") find_package(Git) if(GIT_FOUND AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") - if(WIN32) - execute_process(COMMAND $ENV{COMSPEC} /C ${GIT_EXECUTABLE} -C ${CMAKE_CURRENT_SOURCE_DIR} rev-parse HEAD OUTPUT_VARIABLE GIT_SHA) - else() - execute_process(COMMAND ${GIT_EXECUTABLE} -C ${CMAKE_CURRENT_SOURCE_DIR} rev-parse HEAD OUTPUT_VARIABLE GIT_SHA) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_SHA COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD ) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" RESULT_VARIABLE GIT_MOD COMMAND "${GIT_EXECUTABLE}" diff-index HEAD --quiet) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_DATE COMMAND "${GIT_EXECUTABLE}" log -1 --date=format:"%Y-%m-%d %T" --format="%ad") + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_TAG RESULT_VARIABLE rv COMMAND "${GIT_EXECUTABLE}" symbolic-ref -q --short HEAD OUTPUT_STRIP_TRAILING_WHITESPACE) + if (rv AND NOT rv EQUAL 0) + execute_process(WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" OUTPUT_VARIABLE GIT_TAG COMMAND "${GIT_EXECUTABLE}" describe --tags --exact-match OUTPUT_STRIP_TRAILING_WHITESPACE) endif() else() set(GIT_SHA 0) + set(GIT_MOD 1) endif() - -string(REGEX REPLACE "[^0-9a-f]+" "" GIT_SHA "${GIT_SHA}") - +string(REGEX REPLACE "[^0-9a-fA-F]+" "" GIT_SHA "${GIT_SHA}") +string(REGEX REPLACE "[^0-9: /-]+" "" GIT_DATE "${GIT_DATE}") option(WITH_MD_LIBRARY "build with MD" ON) if(WIN32 AND MSVC) @@ -207,15 +221,16 @@ endif() set(BUILD_VERSION_CC ${CMAKE_BINARY_DIR}/build_version.cc) configure_file(util/build_version.cc.in ${BUILD_VERSION_CC} @ONLY) -add_library(build_version OBJECT ${BUILD_VERSION_CC}) -target_include_directories(build_version PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/util) + if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4800 /wd4996 /wd4351 /wd4100 /wd4204 /wd4324") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall") + 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall -pthread") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wstrict-prototypes") + endif() if(MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format -fno-asynchronous-unwind-tables") add_definitions(-D_POSIX_C_SOURCE=1) @@ -251,14 +266,14 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") endif(HAS_ALTIVEC) endif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") -if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|AARCH64") CHECK_C_COMPILER_FLAG("-march=armv8-a+crc+crypto" HAS_ARMV8_CRC) if(HAS_ARMV8_CRC) message(STATUS " HAS_ARMV8_CRC yes") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+crc+crypto -Wno-unused-function") endif(HAS_ARMV8_CRC) -endif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64") +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|AARCH64") option(PORTABLE "build a portable binary" OFF) option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF) @@ -297,6 +312,7 @@ else() endif() include(CheckCXXSourceCompiles) +set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) if(NOT MSVC) set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul") endif() @@ -313,7 +329,6 @@ int main() { auto d = _mm_cvtsi128_si64(c); } " HAVE_SSE42) -unset(CMAKE_REQUIRED_FLAGS) if(HAVE_SSE42) add_definitions(-DHAVE_SSE42) add_definitions(-DHAVE_PCLMUL) @@ -321,18 +336,66 @@ elseif(FORCE_SSE42) message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") endif() +# Check if -latomic is required or not +if (NOT MSVC) + set(CMAKE_REQUIRED_FLAGS "--std=c++11") + CHECK_CXX_SOURCE_COMPILES(" +#include +std::atomic x(0); +int main() { + uint64_t i = x.load(std::memory_order_relaxed); + bool b = x.is_lock_free(); + return 0; +} +" BUILTIN_ATOMIC) +if (NOT BUILTIN_ATOMIC) + #TODO: Check if -latomic exists + list(APPEND THIRDPARTY_LIBS atomic) +endif() +endif() + +if (WITH_LIBURING) + set(CMAKE_REQUIRED_FLAGS "-luring") + CHECK_CXX_SOURCE_COMPILES(" +#include +int main() { + struct io_uring ring; + io_uring_queue_init(1, &ring, 0); + return 0; +} +" HAS_LIBURING) + if (HAS_LIBURING) + add_definitions(-DROCKSDB_IOURING_PRESENT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -luring") + endif() +endif() + +# Reset the required flags +set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS}) + CHECK_CXX_SOURCE_COMPILES(" #if defined(_MSC_VER) && !defined(__thread) #define __thread __declspec(thread) #endif int main() { static __thread int tls; + (void)tls; } " HAVE_THREAD_LOCAL) if(HAVE_THREAD_LOCAL) add_definitions(-DROCKSDB_SUPPORT_THREAD_LOCAL) endif() +option(WITH_IOSTATS_CONTEXT "Enable IO stats context" ON) +if (NOT WITH_IOSTATS_CONTEXT) + add_definitions(-DNIOSTATS_CONTEXT) +endif() + +option(WITH_PERF_CONTEXT "Enable perf context" ON) +if (NOT WITH_PERF_CONTEXT) + add_definitions(-DNPERF_CONTEXT) +endif() + option(FAIL_ON_WARNINGS "Treat compile warnings as errors" ON) if(FAIL_ON_WARNINGS) if(MSVC) @@ -474,11 +537,6 @@ if(CMAKE_SYSTEM_NAME MATCHES "Cygwin") add_definitions(-fno-builtin-memcmp -DCYGWIN) elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin") add_definitions(-DOS_MACOSX) - if(CMAKE_SYSTEM_PROCESSOR MATCHES arm) - 
add_definitions(-DIOS_CROSS_COMPILE -DROCKSDB_LITE) - # no debug info for IOS, that will make our library big - add_definitions(-DNDEBUG) - endif() elseif(CMAKE_SYSTEM_NAME MATCHES "Linux") add_definitions(-DOS_LINUX) elseif(CMAKE_SYSTEM_NAME MATCHES "SunOS") @@ -573,12 +631,15 @@ find_package(Threads REQUIRED) set(SOURCES cache/cache.cc + cache/cache_entry_roles.cc cache/clock_cache.cc cache/lru_cache.cc cache/sharded_cache.cc db/arena_wrapped_db_iter.cc + db/blob/blob_fetcher.cc db/blob/blob_file_addition.cc db/blob/blob_file_builder.cc + db/blob/blob_file_cache.cc db/blob/blob_file_garbage.cc db/blob/blob_file_meta.cc db/blob/blob_file_reader.cc @@ -588,7 +649,6 @@ set(SOURCES db/builder.cc db/c.cc db/column_family.cc - db/compacted_db_impl.cc db/compaction/compaction.cc db/compaction/compaction_iterator.cc db/compaction/compaction_picker.cc @@ -599,6 +659,7 @@ set(SOURCES db/compaction/sst_partitioner.cc db/convenience.cc db/db_filesnapshot.cc + db/db_impl/compacted_db_impl.cc db/db_impl/db_impl.cc db/db_impl/db_impl_write.cc db/db_impl/db_impl_compaction_flush.cc @@ -649,17 +710,20 @@ set(SOURCES db/write_batch_base.cc db/write_controller.cc db/write_thread.cc + env/composite_env.cc env/env.cc env/env_chroot.cc env/env_encryption.cc env/env_hdfs.cc env/file_system.cc env/file_system_tracer.cc + env/fs_remap.cc env/mock_env.cc file/delete_scheduler.cc file/file_prefetch_buffer.cc file/file_util.cc file/filename.cc + file/line_file_reader.cc file/random_access_file_reader.cc file/read_write_util.cc file/readahead_raf.cc @@ -694,6 +758,7 @@ set(SOURCES monitoring/thread_status_util_debug.cc options/cf_options.cc options/configurable.cc + options/customizable.cc options/db_options.cc options/options.cc options/options_helper.cc @@ -772,6 +837,7 @@ set(SOURCES util/murmurhash.cc util/random.cc util/rate_limiter.cc + util/ribbon_config.cc util/slice.cc util/file_checksum_helper.cc util/status.cc @@ -817,8 +883,11 @@ set(SOURCES utilities/simulator_cache/sim_cache.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc utilities/trace/file_trace_reader_writer.cc - utilities/transactions/lock/lock_tracker.cc - utilities/transactions/lock/point_lock_tracker.cc + utilities/transactions/lock/lock_manager.cc + utilities/transactions/lock/point/point_lock_tracker.cc + utilities/transactions/lock/point/point_lock_manager.cc + utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc + utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc utilities/transactions/optimistic_transaction_db_impl.cc utilities/transactions/optimistic_transaction.cc utilities/transactions/pessimistic_transaction.cc @@ -826,7 +895,6 @@ set(SOURCES utilities/transactions/snapshot_checker.cc utilities/transactions/transaction_base.cc utilities/transactions/transaction_db_mutex_impl.cc - utilities/transactions/transaction_lock_mgr.cc utilities/transactions/transaction_util.cc utilities/transactions/write_prepared_txn.cc utilities/transactions/write_prepared_txn_db.cc @@ -854,6 +922,20 @@ set(SOURCES db/db_impl/db_impl_remote_compaction.cc $) +list(APPEND SOURCES + utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc + utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc + utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc + utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc + utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc + 
utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc + utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc + utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc + utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc + utilities/transactions/lock/range/range_tree/lib/standalone_port.cc + utilities/transactions/lock/range/range_tree/lib/util/dbt.cc + utilities/transactions/lock/range/range_tree/lib/util/memarena.cc) + if(HAVE_SSE42 AND NOT MSVC) set_source_files_properties( util/crc32c.cc @@ -929,12 +1011,12 @@ else() set(SYSTEM_LIBS ${CMAKE_THREAD_LIBS_INIT}) endif() -add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES}) +add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES} ${BUILD_VERSION_CC}) target_link_libraries(${ROCKSDB_STATIC_LIB} PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) if(ROCKSDB_BUILD_SHARED) - add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES}) + add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES} ${BUILD_VERSION_CC}) target_link_libraries(${ROCKSDB_SHARED_LIB} PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) @@ -952,7 +1034,7 @@ if(ROCKSDB_BUILD_SHARED) LINKER_LANGUAGE CXX VERSION ${rocksdb_VERSION} SOVERSION ${rocksdb_VERSION_MAJOR} - OUTPUT_NAME "rocksdb") + OUTPUT_NAME "rocksdb${ARTIFACT_SUFFIX}") endif() endif() @@ -1074,15 +1156,21 @@ if(WITH_TESTS) cloud/remote_compaction_test.cc db/blob/blob_file_addition_test.cc db/blob/blob_file_builder_test.cc + db/blob/blob_file_cache_test.cc db/blob/blob_file_garbage_test.cc db/blob/blob_file_reader_test.cc + db/blob/db_blob_basic_test.cc + db/blob/db_blob_compaction_test.cc + db/blob/db_blob_corruption_test.cc db/blob/db_blob_index_test.cc db/column_family_test.cc db/compact_files_test.cc + db/compaction/clipping_iterator_test.cc db/compaction/compaction_job_stats_test.cc db/compaction/compaction_job_test.cc db/compaction/compaction_iterator_test.cc db/compaction/compaction_picker_test.cc + db/compaction/compaction_service_test.cc db/comparator_db_test.cc db/corruption_test.cc db/cuckoo_table_db_test.cc @@ -1098,6 +1186,7 @@ if(WITH_TESTS) db/db_iter_test.cc db/db_iter_stress_test.cc db/db_iterator_test.cc + db/db_kv_checksum_test.cc db/db_log_iter_test.cc db/db_memtable_test.cc db/db_merge_operator_test.cc @@ -1105,7 +1194,7 @@ if(WITH_TESTS) db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc - db/db_impl/db_secondary_test.cc + db/db_secondary_test.cc db/db_sst_test.cc db/db_statistics_test.cc db/db_table_properties_test.cc @@ -1169,6 +1258,7 @@ if(WITH_TESTS) monitoring/statistics_test.cc monitoring/stats_history_test.cc options/configurable_test.cc + options/customizable_test.cc options/options_settable_test.cc options/options_test.cc table/block_based/block_based_filter_block_test.cc @@ -1185,6 +1275,7 @@ if(WITH_TESTS) table/table_test.cc table/block_fetcher_test.cc test_util/testutil_test.cc + trace_replay/block_cache_tracer_test.cc trace_replay/io_tracer_test.cc tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc tools/io_tracer_parser_test.cc @@ -1205,6 +1296,7 @@ if(WITH_TESTS) util/random_test.cc util/rate_limiter_test.cc util/repeatable_thread_test.cc + util/ribbon_test.cc util/slice_test.cc util/slice_transform_test.cc util/timer_queue_test.cc @@ -1231,9 +1323,10 @@ if(WITH_TESTS) utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/transactions/optimistic_transaction_test.cc utilities/transactions/transaction_test.cc - utilities/transactions/transaction_lock_mgr_test.cc + 
utilities/transactions/lock/point/point_lock_manager_test.cc utilities/transactions/write_prepared_transaction_test.cc utilities/transactions/write_unprepared_transaction_test.cc + utilities/transactions/lock/range/range_locking_test.cc utilities/ttl/ttl_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc ) @@ -1268,21 +1361,21 @@ if(WITH_TESTS) foreach(sourcefile ${TESTS}) get_filename_component(exename ${sourcefile} NAME_WE) - add_executable(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} ${sourcefile}) - set_target_properties(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} + add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile}) + set_target_properties(${exename}${ARTIFACT_SUFFIX} PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 OUTPUT_NAME ${exename}${ARTIFACT_SUFFIX} ) - target_link_libraries(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} testharness gtest ${THIRDPARTY_LIBS} ${ROCKSDB_LIB}) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} testharness gtest ${THIRDPARTY_LIBS} ${ROCKSDB_LIB}) if(NOT "${exename}" MATCHES "db_sanity_test") - add_test(NAME ${exename} COMMAND ${exename}${ARTIFACT_SUFFIX}) - add_dependencies(check ${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX}) + gtest_discover_tests(${exename} DISCOVERY_TIMEOUT 120) + add_dependencies(check ${exename}${ARTIFACT_SUFFIX}) endif() if("${exename}" MATCHES "env_librados_test") # env_librados_test.cc uses librados directly - target_link_libraries(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} rados) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} rados) endif() endforeach(sourcefile ${TESTS}) @@ -1307,40 +1400,42 @@ if(WITH_TESTS) endif() if(WITH_BENCHMARK_TOOLS) - add_executable(db_bench + add_executable(db_bench${ARTIFACT_SUFFIX} + tools/simulated_hybrid_file_system.cc tools/db_bench.cc tools/db_bench_tool.cc) - target_link_libraries(db_bench + target_link_libraries(db_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${THIRDPARTY_LIBS}) - add_executable(cache_bench - cache/cache_bench.cc) - target_link_libraries(cache_bench + add_executable(cache_bench${ARTIFACT_SUFFIX} + cache/cache_bench.cc + cache/cache_bench_tool.cc) + target_link_libraries(cache_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(memtablerep_bench + add_executable(memtablerep_bench${ARTIFACT_SUFFIX} memtable/memtablerep_bench.cc) - target_link_libraries(memtablerep_bench + target_link_libraries(memtablerep_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(range_del_aggregator_bench + add_executable(range_del_aggregator_bench${ARTIFACT_SUFFIX} db/range_del_aggregator_bench.cc) - target_link_libraries(range_del_aggregator_bench + target_link_libraries(range_del_aggregator_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(table_reader_bench + add_executable(table_reader_bench${ARTIFACT_SUFFIX} table/table_reader_bench.cc) - target_link_libraries(table_reader_bench + target_link_libraries(table_reader_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} testharness ${GFLAGS_LIB}) - add_executable(filter_bench + add_executable(filter_bench${ARTIFACT_SUFFIX} util/filter_bench.cc) - target_link_libraries(filter_bench + target_link_libraries(filter_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${GFLAGS_LIB}) - add_executable(hash_table_bench + add_executable(hash_table_bench${ARTIFACT_SUFFIX} utilities/persistent_cache/hash_table_bench.cc) - 
target_link_libraries(hash_table_bench + target_link_libraries(hash_table_bench${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${GFLAGS_LIB}) endif() diff --git a/HISTORY.md b/HISTORY.md index a4a66317dc7..837bb1b73e4 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,34 +1,223 @@ # Rocksdb Change Log -## 6.14.6 (12/01/2020) +## 6.22.1 (2021-06-25) ### Bug Fixes -* Truncated WALs ending in incomplete records can no longer produce gaps in the recovered data when `WALRecoveryMode::kPointInTimeRecovery` is used. Gaps are still possible when WALs are truncated exactly on record boundaries. +* `GetLiveFilesMetaData()` now populates the `temperature`, `oldest_ancester_time`, and `file_creation_time` fields of its `LiveFileMetaData` results when the information is available. Previously these fields always contained zero indicating unknown. + +## 6.22.0 (2021-06-18) +### Behavior Changes +* Added two additional tickers, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH. These stats can be used to estimate the ratio of "garbage" (outdated) bytes in the memtable that are discarded at flush time. +* Added API comments clarifying safe usage of Disable/EnableManualCompaction and EventListener callbacks for compaction. -## 6.14.5 (11/15/2020) ### Bug Fixes -* Fix a bug of encoding and parsing BlockBasedTableOptions::read_amp_bytes_per_bit as a 64-bit integer. -* Fixed the logic of populating native data structure for `read_amp_bytes_per_bit` during OPTIONS file parsing on big-endian architecture. Without this fix, original code introduced in PR7659, when running on big-endian machine, can mistakenly store read_amp_bytes_per_bit (an uint32) in little endian format. Future access to `read_amp_bytes_per_bit` will give wrong values. Little endian architecture is not affected. +* fs_posix.cc GetFreeSpace() always report disk space available to root even when running as non-root. Linux defaults often have disk mounts with 5 to 10 percent of total space reserved only for root. Out of space could result for non-root users. +* Subcompactions are now disabled when user-defined timestamps are used, since the subcompaction boundary picking logic is currently not timestamp-aware, which could lead to incorrect results when different subcompactions process keys that only differ by timestamp. +* Fix an issue that `DeleteFilesInRange()` may cause ongoing compaction reports corruption exception, or ASSERT for debug build. There's no actual data loss or corruption that we find. +* Fixed confusingly duplicated output in LOG for periodic stats ("DUMPING STATS"), including "Compaction Stats" and "File Read Latency Histogram By Level". +* Fixed performance bugs in background gathering of block cache entry statistics, that could consume a lot of CPU when there are many column families with a shared block cache. + +### New Features +* Marked the Ribbon filter and optimize_filters_for_memory features as production-ready, each enabling memory savings for Bloom-like filters. Use `NewRibbonFilterPolicy` in place of `NewBloomFilterPolicy` to use Ribbon filters instead of Bloom, or `ribbonfilter` in place of `bloomfilter` in configuration string. +* Allow `DBWithTTL` to use `DeleteRange` api just like other DBs. `DeleteRangeCF()` which executes `WriteBatchInternal::DeleteRange()` has been added to the handler in `DBWithTTLImpl::Write()` to implement it. +* Add BlockBasedTableOptions.prepopulate_block_cache. If enabled, it prepopulate warm/hot data blocks which are already in memory into block cache at the time of flush. 
On a flush, the data block that is in memory (in memtables) gets flushed to the device. If using Direct IO, additional IO is incurred to read this data back into memory again, which is avoided by enabling this option; it also helps with distributed file systems. More details in include/rocksdb/table.h. +* Added a `cancel` field to `CompactRangeOptions`, allowing individual in-process manual range compactions to be cancelled. -## 6.14.4 (11/05/2020) +## 6.21.0 (2021-05-21) ### Bug Fixes -Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before `TableBuilder::Finish()` in compaction job. For example, the `NeedCompact()` method of `CompactOnDeletionCollector` returned by built-in `CompactOnDeletionCollectorFactory` requires `BlockBasedTable::Finish()` to return the correct result. The bug can cause a compaction-generated file not to be marked for future compaction based on deletion ratio. +* Fixed a bug in handling file rename error in distributed/network file systems when the server succeeds but the client returns an error. The bug can cause the CURRENT file to point to a non-existent MANIFEST file, so the DB cannot be opened. +* Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results. +* Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`. +* Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted. +* Fixed a bug where `GetLiveFiles()` output included a non-existent file called "OPTIONS-000000". Backups and checkpoints, which use `GetLiveFiles()`, failed on DBs impacted by this bug. Read-write DBs were impacted when the latest OPTIONS file failed to write and `fail_if_options_file_error == false`. Read-only DBs were impacted when no OPTIONS files existed. +* Handle return code by io_uring_submit_and_wait() and io_uring_wait_cqe(). +* In the IngestExternalFile() API, only try to sync the ingested file if the file is linked and the FileSystem/Env supports reopening a writable file. +* Fixed a bug that `AdvancedColumnFamilyOptions.max_compaction_bytes` is under-calculated for manual compaction (`CompactRange()`). Manual compaction is split to multiple compactions if the compaction size exceeds the `max_compaction_bytes`. The bug creates a much larger compaction whose size exceeds the user setting. On the other hand, larger manual compaction size can increase the subcompaction parallelism; you can tune that by setting `max_compaction_bytes`. + +### Behavior Changes +* Due to the fix of false-positive alert of "SST file is ahead of WAL", all the CFs with no SST file (CF empty) will bypass the consistency check. We fixed a false-positive, but introduced a very rare true-negative which will be triggered in the following conditions: a CF with some delete operations in the last few queries, which result in an empty CF (those are flushed to SST file and a compaction triggered which combines this file and all other SST files and generates an empty CF, or there is another reason to write a manifest entry for this CF after a flush that generates no SST file from an empty CF).
The deletion entries are logged in a WAL and this WAL was corrupted, while the CF's log number points to the next WAL (due to the flush). Therefore, the DB can only recover to the point without these trailing deletions and cause the inconsistent DB status. + +### New Features +* Add new option allow_stall passed during instance creation of WriteBufferManager. When allow_stall is set, WriteBufferManager will stall all writers shared across multiple DBs and columns if memory usage goes beyond specified WriteBufferManager::buffer_size (soft limit). Stall will be cleared when memory is freed after flush and memory usage goes down below buffer_size. +* Allow `CompactionFilter`s to apply in more table file creation scenarios such as flush and recovery. For compatibility, `CompactionFilter`s by default apply during compaction. Users can customize this behavior by overriding `CompactionFilterFactory::ShouldFilterTableFileCreation()`. +* Added more fields to FilterBuildingContext with LSM details, for custom filter policies that vary behavior based on where they are in the LSM-tree. +* Added DB::Properties::kBlockCacheEntryStats for querying statistics on what percentage of block cache is used by various kinds of blocks, etc. using DB::GetProperty and DB::GetMapProperty. The same information is now dumped to info LOG periodically according to `stats_dump_period_sec`. +* Add an experimental Remote Compaction feature, which allows the user to run Compaction on a different host or process. The feature is still under development, currently only works on some basic use cases. The interface will be changed without backward/forward compatibility support. +* RocksDB would validate total entries read in flush, and compare with counter inserted into it. If flush_verify_memtable_count = true (default), flush will fail. Otherwise, only log to info logs. +* Add `TableProperties::num_filter_entries`, which can be used with `TableProperties::filter_size` to calculate the effective bits per filter entry (unique user key or prefix) for a table file. + +### Performance Improvements +* BlockPrefetcher is used by iterators to prefetch data if they anticipate more data to be used in future. It is enabled implicitly by rocksdb. Added change to take in account read pattern if reads are sequential. This would disable prefetching for random reads in MultiGet and iterators as readahead_size is increased exponential doing large prefetches. + +### Public API change +* Removed a parameter from TableFactory::NewTableBuilder, which should not be called by user code because TableBuilder is not a public API. +* Removed unused structure `CompactionFilterContext`. +* The `skip_filters` parameter to SstFileWriter is now considered deprecated. Use `BlockBasedTableOptions::filter_policy` to control generation of filters. +* ClockCache is known to have bugs that could lead to crash or corruption, so should not be used until fixed. Use NewLRUCache instead. +* Added a new pure virtual function `ApplyToAllEntries` to `Cache`, to replace `ApplyToAllCacheEntries`. Custom `Cache` implementations must add an implementation. Because this function is for gathering statistics, an empty implementation could be acceptable for some applications. +* Added the ObjectRegistry to the ConfigOptions class. This registry instance will be used to find any customizable loadable objects during initialization. +* Expanded the ObjectRegistry functionality to allow nested ObjectRegistry instances. 
Added methods to register a set of functions with the registry/library as a group. +* Deprecated backupable_db.h and BackupableDBOptions in favor of new versions with appropriate names: backup_engine.h and BackupEngineOptions. Old API compatibility is preserved. + +### Default Option Change +* When options.arena_block_size <= 0 (default value 0), still use writer_buffer_size / 8 but cap to 1MB. Too large alloation size might not be friendly to allocator and might cause performance issues in extreme cases. + +### Build +* By default, try to build with liburing. For make, if ROCKSDB_USE_IO_URING is not set, treat as enable, which means RocksDB will try to build with liburing. Users can disable it with ROCKSDB_USE_IO_URING=0. For cmake, add WITH_LIBURING to control it, with default on. + +## 6.20.0 (2021-04-16) +### Behavior Changes +* `ColumnFamilyOptions::sample_for_compression` now takes effect for creation of all block-based tables. Previously it only took effect for block-based tables created by flush. +* `CompactFiles()` can no longer compact files from lower level to up level, which has the risk to corrupt DB (details: #8063). The validation is also added to all compactions. +* Fixed some cases in which DB::OpenForReadOnly() could write to the filesystem. If you want a Logger with a read-only DB, you must now set DBOptions::info_log yourself, such as using CreateLoggerFromOptions(). +* get_iostats_context() will never return nullptr. If thread-local support is not available, and user does not opt-out iostats context, then compilation will fail. The same applies to perf context as well. +* Added support for WriteBatchWithIndex::NewIteratorWithBase when overwrite_key=false. Previously, this combination was not supported and would assert or return nullptr. +* Improve the behavior of WriteBatchWithIndex for Merge operations. Now more operations may be stored in order to return the correct merged result. -## 6.14.3 (10/30/2020) ### Bug Fixes -* Reverted a behavior change silently introduced in 6.14.2, in which the effects of the `ignore_unknown_options` flag (used in option parsing/loading functions) changed. -* Reverted a behavior change silently introduced in 6.14, in which options parsing/loading functions began returning `NotFound` instead of `InvalidArgument` for option names not available in the present version. +* Use thread-safe `strerror_r()` to get error messages. +* Fixed a potential hang in shutdown for a DB whose `Env` has high-pri thread pool disabled (`Env::GetBackgroundThreads(Env::Priority::HIGH) == 0`) +* Made BackupEngine thread-safe and added documentation comments to clarify what is safe for multiple BackupEngine objects accessing the same backup directory. +* Fixed crash (divide by zero) when compression dictionary is applied to a file containing only range tombstones. +* Fixed a backward iteration bug with partitioned filter enabled: not including the prefix of the last key of the previous filter partition in current filter partition can cause wrong iteration result. +* Fixed a bug that allowed `DBOptions::max_open_files` to be set with a non-negative integer with `ColumnFamilyOptions::compaction_style = kCompactionStyleFIFO`. + +### Performance Improvements +* On ARM platform, use `yield` instead of `wfe` to relax cpu to gain better performance. + +### Public API change +* Added `TableProperties::slow_compression_estimated_data_size` and `TableProperties::fast_compression_estimated_data_size`. 
When `ColumnFamilyOptions::sample_for_compression > 0`, they estimate what `TableProperties::data_size` would have been if the "fast" or "slow" (see `ColumnFamilyOptions::sample_for_compression` API doc for definitions) compression had been used instead. +* Update DB::StartIOTrace and remove Env object from the arguments as its redundant and DB already has Env object that is passed down to IOTracer::StartIOTrace +* Added `FlushReason::kWalFull`, which is reported when a memtable is flushed due to the WAL reaching its size limit; those flushes were previously reported as `FlushReason::kWriteBufferManager`. Also, changed the reason for flushes triggered by the write buffer manager to `FlushReason::kWriteBufferManager`; they were previously reported as `FlushReason::kWriteBufferFull`. +* Extend file_checksum_dump ldb command and DB::GetLiveFilesChecksumInfo API for IntegratedBlobDB and get checksum of blob files along with SST files. -## 6.14.2 (10/21/2020) +### New Features +* Added the ability to open BackupEngine backups as read-only DBs, using BackupInfo::name_for_open and env_for_open provided by BackupEngine::GetBackupInfo() with include_file_details=true. +* Added BackupEngine support for integrated BlobDB, with blob files shared between backups when table files are shared. Because of current limitations, blob files always use the kLegacyCrc32cAndFileSize naming scheme, and incremental backups must read and checksum all blob files in a DB, even for files that are already backed up. +* Added an optional output parameter to BackupEngine::CreateNewBackup(WithMetadata) to return the BackupID of the new backup. +* Added BackupEngine::GetBackupInfo / GetLatestBackupInfo for querying individual backups. +* Made the Ribbon filter a long-term supported feature in terms of the SST schema(compatible with version >= 6.15.0) though the API for enabling it is expected to change. + +## 6.19.0 (2021-03-21) ### Bug Fixes -* Fixed a bug which causes hang in closing DB when refit level is set in opt build. It was because ContinueBackgroundWork() was called in assert statement which is a no op. It was introduced in 6.14. +* Fixed the truncation error found in APIs/tools when dumping block-based SST files in a human-readable format. After fix, the block-based table can be fully dumped as a readable file. +* When hitting a write slowdown condition, no write delay (previously 1 millisecond) is imposed until `delayed_write_rate` is actually exceeded, with an initial burst allowance of 1 millisecond worth of bytes. Also, beyond the initial burst allowance, `delayed_write_rate` is now more strictly enforced, especially with multiple column families. + +### Public API change +* Changed default `BackupableDBOptions::share_files_with_checksum` to `true` and deprecated `false` because of potential for data loss. Note that accepting this change in behavior can temporarily increase backup data usage because files are not shared between backups using the two different settings. Also removed obsolete option kFlagMatchInterimNaming. +* Add a new option BlockBasedTableOptions::max_auto_readahead_size. RocksDB does auto-readahead for iterators on noticing more than two reads for a table file if user doesn't provide readahead_size. The readahead starts at 8KB and doubles on every additional read upto max_auto_readahead_size and now max_auto_readahead_size can be configured dynamically as well. Found that 256 KB readahead size provides the best performance, based on experiments, for auto readahead. 
Experiment data is in PR #3282. If the value is set to 0, then no automatic prefetching will be done by rocksdb. Also changing the value will only affect files opened after the change. +* Add support to extend DB::VerifyFileChecksums API to also verify blob file checksums. +* When using the new BlobDB, the amount of data written by flushes/compactions is now broken down into table files and blob files in the compaction statistics; namely, Write(GB) denotes the amount of data written to table files, while Wblob(GB) means the amount of data written to blob files. +* New default BlockBasedTableOptions::format_version=5 to enable new Bloom filter implementation by default, compatible with RocksDB versions >= 6.6.0. +* Add new SetBufferSize API to WriteBufferManager to allow dynamic management of memory allotted to all write buffers. This allows user code to adjust memory monitoring provided by WriteBufferManager as process memory needs change and datasets grow and shrink. +* Clarified the required semantics of Read() functions in FileSystem and Env APIs. Please ensure any custom implementations are compliant. +* For the new integrated BlobDB implementation, compaction statistics now include the amount of data read from blob files during compaction (due to garbage collection or compaction filters). Write amplification metrics have also been extended to account for data read from blob files. +* Add EqualWithoutTimestamp() to Comparator. +* Extend support to track blob files in SSTFileManager whenever a blob file is created/deleted. Blob files will be scheduled for deletion via SSTFileManager, and SSTFileManager will now take blob files into account while calculating size and space limits along with SST files. +* Add new Append and PositionedAppend API with checksum handoff to legacy Env. + +### New Features +* Support compaction filters for the new implementation of BlobDB. Add `FilterBlobByKey()` to `CompactionFilter`. Subclasses can override this method so that compaction filters can determine whether the actual blob value has to be read during compaction. Use a new `kUndetermined` in `CompactionFilter::Decision` to indicate that further action is necessary for the compaction filter to make a decision. +* Add support to extend retrieval of checksums for blob files from the MANIFEST when checkpointing. During backup, rocksdb can detect corruption in blob files during file copies. +* Add new options for db_bench --benchmarks: flush, waitforcompaction, compact0, compact1. +* Add an option to BackupEngine::GetBackupInfo to include the name and size of each backed-up file. Especially in the presence of file sharing among backups, this offers detailed insight into backup space usage. +* Enable backward iteration on keys with user-defined timestamps. +* Add statistics and info log for error handler: counters for bg error, bg io error, bg retryable io error, auto resume count, auto resume total retry number, and auto resume success; Histogram for auto resume retry count in each recovery call. Note that each auto resume attempt will have one or multiple retries. + +### Behavior Changes +* During flush, only WAL sync retryable IO error is mapped to hard error, which will stall the writes. When WAL is used but only SST file write has retryable IO error, it will be mapped to soft error and write will not be affected. + +## 6.18.0 (2021-02-19) ### Behavior Changes +* When retryable IO error occurs during compaction, it is mapped to a soft error and the BG error is set.
However, auto resume is not called to clean the soft error since compaction will reschedule by itself. In this change, When retryable IO error occurs during compaction, BG error is not set. User will be informed the error via EventHelper. +* Introduce a new trace file format for query tracing and replay and trace file version is bump up to 0.2. A payload map is added as the first portion of the payload. We will not have backward compatible issues when adding new entries to trace records. Added the iterator_upper_bound and iterator_lower_bound in Seek and SeekForPrev tracing function. Added them as the new payload member for iterator tracing. + +### New Features +* Add support for key-value integrity protection in live updates from the user buffers provided to `WriteBatch` through the write to RocksDB's in-memory update buffer (memtable). This is intended to detect some cases of in-memory data corruption, due to either software or hardware errors. Users can enable protection by constructing their `WriteBatch` with `protection_bytes_per_key == 8`. +* Add support for updating `full_history_ts_low` option in manual compaction, which is for old timestamp data GC. +* Add a mechanism for using Makefile to build external plugin code into the RocksDB libraries/binaries. This intends to simplify compatibility and distribution for plugins (e.g., special-purpose `FileSystem`s) whose source code resides outside the RocksDB repo. See "plugin/README.md" for developer details, and "PLUGINS.md" for a listing of available plugins. +* Added memory pre-fetching for experimental Ribbon filter, which especially optimizes performance with batched MultiGet. +* A new, experimental version of BlobDB (key-value separation) is now available. The new implementation is integrated into the RocksDB core, i.e. it is accessible via the usual `rocksdb::DB` API, as opposed to the separate `rocksdb::blob_db::BlobDB` interface used by the earlier version, and can be configured on a per-column family basis using the configuration options `enable_blob_files`, `min_blob_size`, `blob_file_size`, `blob_compression_type`, `enable_blob_garbage_collection`, and `blob_garbage_collection_age_cutoff`. It extends RocksDB's consistency guarantees to blobs, and offers more features and better performance. Note that some features, most notably `Merge`, compaction filters, and backup/restore are not yet supported, and there is no support for migrating a database created by the old implementation. -## 6.14.1 (10/13/2020) ### Bug Fixes -* Since 6.12, memtable lookup should report unrecognized value_type as corruption (#7121). -* Since 6.14, fix false positive flush/compaction `Status::Corruption` failure when `paranoid_file_checks == true` and range tombstones were written to the compaction output files. +* Since 6.15.0, `TransactionDB` returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. There are certain cases where range deletion can still be used on such DBs; see the API doc on `TransactionDB::DeleteRange()` for details. +* `OptimisticTransactionDB` now returns error `Status`es from calls to `DeleteRange()` and calls to `Write()` where the `WriteBatch` contains a range deletion. Previously such operations may have succeeded while not providing the expected transactional guarantees. 
+* Fix `WRITE_PREPARED`, `WRITE_UNPREPARED` TransactionDB `MultiGet()` may return uncommitted data with snapshot. +* In DB::OpenForReadOnly, if any error happens while checking Manifest file path, it was overridden by Status::NotFound. It has been fixed and now actual error is returned. + +### Public API Change +* Added a "only_mutable_options" flag to the ConfigOptions. When this flag is "true", the Configurable functions and convenience methods (such as GetDBOptionsFromString) will only deal with options that are marked as mutable. When this flag is true, only options marked as mutable can be configured (a Status::InvalidArgument will be returned) and options not marked as mutable will not be returned or compared. The default is "false", meaning to compare all options. +* Add new Append and PositionedAppend APIs to FileSystem to bring the data verification information (data checksum information) from upper layer (e.g., WritableFileWriter) to the storage layer. In this way, the customized FileSystem is able to verify the correctness of data being written to the storage on time. Add checksum_handoff_file_types to DBOptions. User can use this option to control which file types (Currently supported file tyes: kWALFile, kTableFile, kDescriptorFile.) should use the new Append and PositionedAppend APIs to handoff the verification information. Currently, RocksDB only use crc32c to calculate the checksum for write handoff. +* Add an option, `CompressionOptions::max_dict_buffer_bytes`, to limit the in-memory buffering for selecting samples for generating/training a dictionary. The limit is currently loosely adhered to. + + +## 6.17.0 (2021-01-15) +### Behavior Changes +* When verifying full file checksum with `DB::VerifyFileChecksums()`, we now fail with `Status::InvalidArgument` if the name of the checksum generator used for verification does not match the name of the checksum generator used for protecting the file when it was created. +* Since RocksDB does not continue write the same file if a file write fails for any reason, the file scope write IO error is treated the same as retryable IO error. More information about error handling of file scope IO error is included in `ErrorHandler::SetBGError`. + +### Bug Fixes +* Version older than 6.15 cannot decode VersionEdits `WalAddition` and `WalDeletion`, fixed this by changing the encoded format of them to be ignorable by older versions. +* Fix a race condition between DB startups and shutdowns in managing the periodic background worker threads. One effect of this race condition could be the process being terminated. + +### Public API Change +* Add a public API WriteBufferManager::dummy_entries_in_cache_usage() which reports the size of dummy entries stored in cache (passed to WriteBufferManager). Dummy entries are used to account for DataBlocks. +* Add a SystemClock class that contains the time-related methods from Env. The original methods in Env may be deprecated in a future release. This class will allow easier testing, development, and expansion of time-related features. +* Add a public API GetRocksBuildProperties and GetRocksBuildInfoAsString to get properties about the current build. These properties may include settings related to the GIT settings (branch, timestamp). This change also sets the "build date" based on the GIT properties, rather than the actual build time, thereby enabling more reproducible builds. 
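As a rough illustration of the build-property APIs mentioned in the 6.17.0 entry above, here is a minimal sketch; it assumes the functions are declared in `rocksdb/version.h` and that the property map is iterable as key/value strings.

```cpp
// Minimal sketch (not part of the patch): querying the build-property APIs from
// the 6.17.0 entry above. Assumes they are declared in rocksdb/version.h.
#include <iostream>

#include "rocksdb/version.h"

int main() {
  // One-line, human-readable summary for a program banner or LOG header.
  std::cout << ROCKSDB_NAMESPACE::GetRocksBuildInfoAsString("my_tool") << "\n";

  // Individual properties (e.g. git sha, git tag, build date) as key/value pairs.
  for (const auto& prop : ROCKSDB_NAMESPACE::GetRocksBuildProperties()) {
    std::cout << prop.first << " = " << prop.second << "\n";
  }
  return 0;
}
```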
+ +## 6.16.0 (2020-12-18) +### Behavior Changes +* Attempting to write a merge operand without explicitly configuring `merge_operator` now fails immediately, causing the DB to enter read-only mode. Previously, failure was deferred until the `merge_operator` was needed by a user read or a background operation. + +### Bug Fixes +* Truncated WALs ending in incomplete records can no longer produce gaps in the recovered data when `WALRecoveryMode::kPointInTimeRecovery` is used. Gaps are still possible when WALs are truncated exactly on record boundaries; for complete protection, users should enable `track_and_verify_wals_in_manifest`. +* Fix a bug where compressed blocks read by MultiGet are not inserted into the compressed block cache when use_direct_reads = true. +* Fixed the issue of full scanning on obsolete files when there are too many outstanding compactions with ConcurrentTaskLimiter enabled. +* Fixed the logic of populating native data structure for `read_amp_bytes_per_bit` during OPTIONS file parsing on big-endian architecture. Without this fix, original code introduced in PR7659, when running on big-endian machine, can mistakenly store read_amp_bytes_per_bit (an uint32) in little endian format. Future access to `read_amp_bytes_per_bit` will give wrong values. Little endian architecture is not affected. +* Fixed prefix extractor with timestamp issues. +* Fixed a bug in atomic flush: in two-phase commit mode, the minimum WAL log number to keep is incorrect. +* Fixed a bug related to checkpoint in PR7789: if there are multiple column families, and the checkpoint is not opened as read only, then in rare cases, data loss may happen in the checkpoint. Since backup engine relies on checkpoint, it may also be affected. +* When ldb --try_load_options is used with the --column_family option, the ColumnFamilyOptions for the specified column family was not loaded from the OPTIONS file. Fix it so its loaded from OPTIONS and then overridden with command line overrides. + +### New Features +* User defined timestamp feature supports `CompactRange` and `GetApproximateSizes`. +* Support getting aggregated table properties (kAggregatedTableProperties and kAggregatedTablePropertiesAtLevel) with DB::GetMapProperty, for easier access to the data in a structured format. +* Experimental option BlockBasedTableOptions::optimize_filters_for_memory now works with experimental Ribbon filter (as well as Bloom filter). + +### Public API Change +* Deprecated public but rarely-used FilterBitsBuilder::CalculateNumEntry, which is replaced with ApproximateNumEntries taking a size_t parameter and returning size_t. +* To improve portability the functions `Env::GetChildren` and `Env::GetChildrenFileAttributes` will no longer return entries for the special directories `.` or `..`. +* Added a new option `track_and_verify_wals_in_manifest`. If `true`, the log numbers and sizes of the synced WALs are tracked in MANIFEST, then during DB recovery, if a synced WAL is missing from disk, or the WAL's size does not match the recorded size in MANIFEST, an error will be reported and the recovery will be aborted. Note that this option does not work with secondary instance. +* `rocksdb_approximate_sizes` and `rocksdb_approximate_sizes_cf` in the C API now requires an error pointer (`char** errptr`) for receiving any error. +* All overloads of DB::GetApproximateSizes now return Status, so that any failure to obtain the sizes is indicated to the caller. 
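A minimal sketch of checking the `Status` now returned by `DB::GetApproximateSizes` (per the 6.16.0 entry above); the key ranges used here are made up for illustration.

```cpp
// Minimal sketch (not part of the patch): handling the Status now returned by
// DB::GetApproximateSizes() per the 6.16.0 entry above. Key ranges are made up.
#include <cstdint>
#include <cstdio>

#include "rocksdb/db.h"

void PrintApproximateSizes(rocksdb::DB* db) {
  rocksdb::Range ranges[2] = {rocksdb::Range("a", "c"), rocksdb::Range("x", "z")};
  uint64_t sizes[2] = {0, 0};
  rocksdb::Status s =
      db->GetApproximateSizes(db->DefaultColumnFamily(), ranges, 2, sizes);
  if (!s.ok()) {
    // Before 6.16.0 this overload returned void, so failures were silent.
    std::fprintf(stderr, "GetApproximateSizes failed: %s\n", s.ToString().c_str());
    return;
  }
  std::printf("[a,c): %llu bytes, [x,z): %llu bytes\n",
              static_cast<unsigned long long>(sizes[0]),
              static_cast<unsigned long long>(sizes[1]));
}
```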
+ +## 6.15.0 (2020-11-13) +### Bug Fixes * Fixed a bug in the following combination of features: indexes with user keys (`format_version >= 3`), indexes are partitioned (`index_type == kTwoLevelIndexSearch`), and some index partitions are pinned in memory (`BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache`). The bug could cause keys to be truncated when read from the index leading to wrong read results or other unexpected behavior. * Fixed a bug when indexes are partitioned (`index_type == kTwoLevelIndexSearch`), some index partitions are pinned in memory (`BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache`), and partitions reads could be mixed between block cache and directly from the file (e.g., with `enable_index_compression == 1` and `mmap_read == 1`, partitions that were stored uncompressed due to poor compression ratio would be read directly from the file via mmap, while partitions that were stored compressed would be read from block cache). The bug could cause index partitions to be mistakenly considered empty during reads leading to wrong read results. +* Since 6.12, memtable lookup should report unrecognized value_type as corruption (#7121). +* Since 6.14, fix false positive flush/compaction `Status::Corruption` failure when `paranoid_file_checks == true` and range tombstones were written to the compaction output files. +* Since 6.14, fix a bug that could cause a stalled write to crash with mixed of slowdown and no_slowdown writes (`WriteOptions.no_slowdown=true`). +* Fixed a bug which causes hang in closing DB when refit level is set in opt build. It was because ContinueBackgroundWork() was called in assert statement which is a no op. It was introduced in 6.14. +* Fixed a bug which causes Get() to return incorrect result when a key's merge operand is applied twice. This can occur if the thread performing Get() runs concurrently with a background flush thread and another thread writing to the MANIFEST file (PR6069). +* Reverted a behavior change silently introduced in 6.14.2, in which the effects of the `ignore_unknown_options` flag (used in option parsing/loading functions) changed. +* Reverted a behavior change silently introduced in 6.14, in which options parsing/loading functions began returning `NotFound` instead of `InvalidArgument` for option names not available in the present version. +* Fixed MultiGet bugs it doesn't return valid data with user defined timestamp. +* Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before `TableBuilder::Finish()` in compaction job. For example, the `NeedCompact()` method of `CompactOnDeletionCollector` returned by built-in `CompactOnDeletionCollectorFactory` requires `BlockBasedTable::Finish()` to return the correct result. The bug can cause a compaction-generated file not to be marked for future compaction based on deletion ratio. +* Fixed a seek issue with prefix extractor and timestamp. +* Fixed a bug of encoding and parsing BlockBasedTableOptions::read_amp_bytes_per_bit as a 64-bit integer. +* Fixed a bug of a recovery corner case, details in PR7621. + +### Public API Change +* Deprecate `BlockBasedTableOptions::pin_l0_filter_and_index_blocks_in_cache` and `BlockBasedTableOptions::pin_top_level_index_and_filter`. These options still take effect until users migrate to the replacement APIs in `BlockBasedTableOptions::metadata_cache_options`. Migration guidance can be found in the API comments on the deprecated options. 
+* Add new API `DB::VerifyFileChecksums` to verify SST file checksum with corresponding entries in the MANIFEST if present. Current implementation requires scanning and recomputing file checksums. + +### Behavior Changes +* The dictionary compression settings specified in `ColumnFamilyOptions::compression_opts` now additionally affect files generated by flush and compaction to non-bottommost level. Previously those settings at most affected files generated by compaction to bottommost level, depending on whether `ColumnFamilyOptions::bottommost_compression_opts` overrode them. Users who relied on dictionary compression settings in `ColumnFamilyOptions::compression_opts` affecting only the bottommost level can keep the behavior by moving their dictionary settings to `ColumnFamilyOptions::bottommost_compression_opts` and setting its `enabled` flag. +* When the `enabled` flag is set in `ColumnFamilyOptions::bottommost_compression_opts`, those compression options now take effect regardless of the value in `ColumnFamilyOptions::bottommost_compression`. Previously, those compression options only took effect when `ColumnFamilyOptions::bottommost_compression != kDisableCompressionOption`. Now, they additionally take effect when `ColumnFamilyOptions::bottommost_compression == kDisableCompressionOption` (such a setting causes bottommost compression type to fall back to `ColumnFamilyOptions::compression_per_level` if configured, and otherwise fall back to `ColumnFamilyOptions::compression`). + +### New Features +* An EXPERIMENTAL new Bloom alternative that saves about 30% space compared to Bloom filters, with about 3-4x construction time and similar query times is available using NewExperimentalRibbonFilterPolicy. -## 6.14 (10/09/2020) +## 6.14 (2020-10-09) ### Bug fixes * Fixed a bug after a `CompactRange()` with `CompactRangeOptions::change_level` set fails due to a conflict in the level change step, which caused all subsequent calls to `CompactRange()` with `CompactRangeOptions::change_level` set to incorrectly fail with a `Status::NotSupported("another thread is refitting")` error. * Fixed a bug that the bottom most level compaction could still be a trivial move even if `BottommostLevelCompaction.kForce` or `kForceOptimized` is set. @@ -44,12 +233,15 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * The settings of the DBOptions and ColumnFamilyOptions are now managed by Configurable objects (see New Features). The same convenience methods to configure these options still exist but the backend implementation has been unified under a common implementation. ### New Features + * Methods to configure serialize, and compare -- such as TableFactory -- are exposed directly through the Configurable base class (from which these objects inherit). This change will allow for better and more thorough configuration management and retrieval in the future. The options for a Configurable object can be set via the ConfigureFromMap, ConfigureFromString, or ConfigureOption method. The serialized version of the options of an object can be retrieved via the GetOptionString, ToString, or GetOption methods. The list of options supported by an object can be obtained via the GetOptionNames method. The "raw" object (such as the BlockBasedTableOption) for an option may be retrieved via the GetOptions method. Configurable options can be compared via the AreEquivalent method. The settings within a Configurable object may be validated via the ValidateOptions method. 
The object may be intialized (at which point only mutable options may be updated) via the PrepareOptions method. * Introduce options.check_flush_compaction_key_order with default value to be true. With this option, during flush and compaction, key order will be checked when writing to each SST file. If the order is violated, the flush or compaction will fail. * Added is_full_compaction to CompactionJobStats, so that the information is available through the EventListener interface. * Add more stats for MultiGet in Histogram to get number of data blocks, index blocks, filter blocks and sst files read from file system per level. +* SST files have a new table property called db_host_id, which is set to the hostname by default. A new option in DBOptions, db_host_id, allows the property value to be overridden with a user specified string, or disable it completely by making the option string empty. +* Methods to create customizable extensions -- such as TableFactory -- are exposed directly through the Customizable base class (from which these objects inherit). This change will allow these Customizable classes to be loaded and configured in a standard way (via CreateFromString). More information on how to write and use Customizable classes is in the customizable.h header file. -## 6.13 (09/12/2020) +## 6.13 (2020-09-12) ### Bug fixes * Fix a performance regression introduced in 6.4 that makes a upper bound check for every Next() even if keys are within a data block that is within the upper bound. * Fix a possible corruption to the LSM state (overlapping files within a level) when a `CompactRange()` for refitting levels (`CompactRangeOptions::change_level == true`) and another manual compaction are executed in parallel. @@ -129,7 +321,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Reduce key comparisons during random access in all block-based tables. * BackupEngine avoids unnecessary repeated checksum computation for backing up a table file to the `shared_checksum` directory when using `share_files_with_checksum_naming = kUseDbSessionId` (new default), except on SST files generated before this version of RocksDB, which fall back on using `kLegacyCrc32cAndFileSize`. -## 6.11 (6/12/2020) +## 6.11 (2020-06-12) ### Bug Fixes * Fix consistency checking error swallowing in some cases when options.force_consistency_checks = true. * Fix possible false NotFound status from batched MultiGet using index type kHashSearch. @@ -164,7 +356,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before ### Performance Improvements * Eliminate redundant key comparisons during random access in block-based tables. -## 6.10 (5/2/2020) +## 6.10 (2020-05-02) ### Bug Fixes * Fix wrong result being read from ingested file. May happen when a key in the file happen to be prefix of another key also in the file. The issue can further cause more data corruption. The issue exists with rocksdb >= 5.0.0 since DB::IngestExternalFile() was introduced. * Finish implementation of BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey. It's now ready for use. Significantly reduces read amplification in some setups, especially for iterator seeks. @@ -196,7 +388,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Improve performance of batch MultiGet with partitioned filters, by sharing block cache lookups to applicable filter blocks. * Reduced memory copies when fetching and uncompressing compressed blocks from sst files. 
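As a rough illustration of the Configurable interface described in the 6.14 notes above, the sketch below configures a table factory from an option string and reads the serialized settings back. The `ConfigOptions`-first signatures of `ConfigureFromString` and `GetOptionString` are assumed from `include/rocksdb/configurable.h`, and the option string keys are only examples.

```cpp
#include <iostream>
#include <memory>
#include <string>

#include <rocksdb/convenience.h>
#include <rocksdb/options.h>
#include <rocksdb/table.h>

using namespace ROCKSDB_NAMESPACE;

int main() {
  std::shared_ptr<TableFactory> factory(NewBlockBasedTableFactory());

  ConfigOptions config_options;
  // Configure the factory from a serialized option string.
  Status s = factory->ConfigureFromString(
      config_options, "block_size=64K;cache_index_and_filter_blocks=true");
  if (!s.ok()) {
    std::cerr << "configure failed: " << s.ToString() << std::endl;
    return 1;
  }

  // Retrieve the serialized form of the current settings.
  std::string serialized;
  s = factory->GetOptionString(config_options, &serialized);
  if (s.ok()) std::cout << serialized << std::endl;
  return s.ok() ? 0 : 1;
}
```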
-## 6.9.0 (03/29/2020) +## 6.9.0 (2020-03-29) ### Behavior changes * Since RocksDB 6.8, ttl-based FIFO compaction can drop a file whose oldest key becomes older than options.ttl while others have not. This fix reverts this and makes ttl-based FIFO compaction use the file's flush time as the criterion. This fix also requires that max_open_files = -1 and compaction_options_fifo.allow_compaction = false to function properly. @@ -223,7 +415,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Added a new option, best_efforts_recovery (default: false), to allow database to open in a db dir with missing table files. During best efforts recovery, missing table files are ignored, and database recovers to the most recent state without missing table file. Cross-column-family consistency is not guaranteed even if WAL is enabled. * options.bottommost_compression, options.compression_opts and options.bottommost_compression_opts are now dynamically changeable. -## 6.8.0 (02/24/2020) +## 6.8.0 (2020-02-24) ### Java API Changes * Major breaking changes to Java comparators, toward standardizing on ByteBuffer for performant, locale-neutral operations on keys (#6252). * Added overloads of common API methods using direct ByteBuffers for keys and values (#2283). @@ -250,21 +442,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * `db_bench` now supports `value_size_distribution_type`, `value_size_min`, `value_size_max` options for generating random variable sized value. Added `blob_db_compression_type` option for BlobDB to enable blob compression. * Replace RocksDB namespace "rocksdb" with flag "ROCKSDB_NAMESPACE" which if is not defined, defined as "rocksdb" in header file rocksdb_namespace.h. -## 6.7.3 (03/18/2020) -### Bug Fixes -* Fix a data race that might cause crash when calling DB::GetCreationTimeOfOldestFile() by a small chance. The bug was introduced in 6.6 Release. - -## 6.7.2 (02/24/2020) -### Bug Fixes -* Fixed a bug of IO Uring partial result handling introduced in 6.7.0. - - -## 6.7.1 (02/13/2020) -### Bug Fixes -* Fixed issue #6316 that can cause a corruption of the MANIFEST file in the middle when writing to it fails due to no disk space. -* Batched MultiGet() ignores IO errors while reading data blocks, causing it to potentially continue looking for a key and returning stale results. - -## 6.7.0 (01/21/2020) +## 6.7.0 (2020-01-21) ### Public API Change * Added a rocksdb::FileSystem class in include/rocksdb/file_system.h to encapsulate file creation/read/write operations, and an option DBOptions::file_system to allow a user to pass in an instance of rocksdb::FileSystem. If its a non-null value, this will take precendence over DBOptions::env for file operations. A new API rocksdb::FileSystem::Default() returns a platform default object. The DBOptions::env option and Env::Default() API will continue to be used for threading and other OS related functions, and where DBOptions::file_system is not specified, for file operations. For storage developers who are accustomed to rocksdb::Env, the interface in rocksdb::FileSystem is new and will probably undergo some changes as more storage systems are ported to it from rocksdb::Env. As of now, no env other than Posix has been ported to the new interface. * A new rocksdb::NewSstFileManager() API that allows the caller to pass in separate Env and FileSystem objects. 
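For the best_efforts_recovery option mentioned above, a minimal sketch of opening a DB directory that may be missing table files follows; the path is illustrative, and as the changelog notes, cross-column-family consistency is not guaranteed in this mode.

```cpp
#include <rocksdb/db.h>
#include <rocksdb/options.h>

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  // Best-efforts recovery (default: false): ignore missing table files and
  // recover to the most recent state that does not depend on them.
  options.best_efforts_recovery = true;

  DB* db = nullptr;
  Status s = DB::Open(options, "/tmp/rocksdb_partial_dir", &db);
  if (s.ok()) delete db;
  return s.ok() ? 0 : 1;
}
```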
@@ -289,11 +467,11 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Introduce ReadOptions.auto_prefix_mode. When set to true, iterator will return the same result as total order seek, but may choose to use prefix seek internally based on seek key and iterator upper bound. * MultiGet() can use IO Uring to parallelize read from the same SST file. This featuer is by default disabled. It can be enabled with environment variable ROCKSDB_USE_IO_URING. -## 6.6.2 (01/13/2020) +## 6.6.2 (2020-01-13) ### Bug Fixes * Fixed a bug where non-L0 compaction input files were not considered to compute the `creation_time` of new compaction outputs. -## 6.6.1 (01/02/2020) +## 6.6.1 (2020-01-02) ### Bug Fixes * Fix a bug in WriteBatchWithIndex::MultiGetFromBatchAndDB, which is called by Transaction::MultiGet, that causes due to stale pointer access when the number of keys is > 32 * Fixed two performance issues related to memtable history trimming. First, a new SuperVersion is now created only if some memtables were actually trimmed. Second, trimming is only scheduled if there is at least one flushed memtable that is kept in memory for the purposes of transaction conflict checking. @@ -303,7 +481,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Delete superversions in BackgroundCallPurge. * Fix use-after-free and double-deleting files in BackgroundCallPurge(). -## 6.6.0 (11/25/2019) +## 6.6.0 (2019-11-25) ### Bug Fixes * Fix data corruption caused by output of intra-L0 compaction on ingested file not being placed in correct order in L0. * Fix a data race between Version::GetColumnFamilyMetaData() and Compaction::MarkFilesBeingCompacted() for access to being_compacted (#6056). The current fix acquires the db mutex during Version::GetColumnFamilyMetaData(), which may cause regression. @@ -356,19 +534,19 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * For 64-bit hashing, RocksDB is standardizing on a slightly modified preview version of XXH3. This function is now used for many non-persisted hashes, along with fastrange64() in place of the modulus operator, and some benchmarks show a slight improvement. * Level iterator to invlidate the iterator more often in prefix seek and the level is filtered out by prefix bloom. -## 6.5.2 (11/15/2019) +## 6.5.2 (2019-11-15) ### Bug Fixes * Fix a assertion failure in MultiGet() when BlockBasedTableOptions::no_block_cache is true and there is no compressed block cache * Fix a buffer overrun problem in BlockBasedTable::MultiGet() when compression is enabled and no compressed block cache is configured. * If a call to BackupEngine::PurgeOldBackups or BackupEngine::DeleteBackup suffered a crash, power failure, or I/O error, files could be left over from old backups that could only be purged with a call to GarbageCollect. Any call to PurgeOldBackups, DeleteBackup, or GarbageCollect should now suffice to purge such files. -## 6.5.1 (10/16/2019) +## 6.5.1 (2019-10-16) ### Bug Fixes * Revert the feature "Merging iterator to avoid child iterator reseek for some cases (#5286)" since it might cause strange results when reseek happens with a different iterator upper bound. * Fix a bug in BlockBasedTableIterator that might return incorrect results when reseek happens with a different iterator upper bound. * Fix a bug when partitioned filters and prefix search are used in conjunction, ::SeekForPrev could return invalid for an existing prefix. 
::SeekForPrev might be called by the user, or internally on ::Prev, or within ::Seek if the return value involves Delete or a Merge operand. -## 6.5.0 (9/13/2019) +## 6.5.0 (2019-09-13) ### Bug Fixes * Fixed a number of data races in BlobDB. * Fix a bug where the compaction snapshot refresh feature is not disabled as advertised when `snap_refresh_nanos` is set to 0.. @@ -389,7 +567,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before ### Performance Improvements * Improve the speed of the MemTable Bloom filter, reducing the write overhead of enabling it by 1/3 to 1/2, with similar benefit to read performance. -## 6.4.0 (7/30/2019) +## 6.4.0 (2019-07-30) ### Default Option Change * LRUCacheOptions.high_pri_pool_ratio is set to 0.5 (previously 0.0) by default, which means that by default midpoint insertion is enabled. The same change is made for the default value of high_pri_pool_ratio argument in NewLRUCache(). When block cache is not explicitly created, the small block cache created by BlockBasedTable will still has this option to be 0.0. * Change BlockBasedTableOptions.cache_index_and_filter_blocks_with_high_priority's default value from false to true. @@ -425,7 +603,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fixed a regression where the fill_cache read option also affected index blocks. * Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes/filters as well. -## 6.3.2 (8/15/2019) +## 6.3.2 (2019-08-15) ### Public API Change * The semantics of the per-block-type block read counts in the performance context now match those of the generic block_read_count. @@ -433,11 +611,11 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fixed a regression where the fill_cache read option also affected index blocks. * Fixed an issue where using cache_index_and_filter_blocks==false affected partitions of partitioned indexes as well. -## 6.3.1 (7/24/2019) +## 6.3.1 (2019-07-24) ### Bug Fixes * Fix auto rolling bug introduced in 6.3.0, which causes segfault if log file creation fails. -## 6.3.0 (6/18/2019) +## 6.3.0 (2019-06-18) ### Public API Change * Now DB::Close() will return Aborted() error when there is unreleased snapshot. Users can retry after all snapshots are released. * Index blocks are now handled similarly to data blocks with regards to the block cache: instead of storing objects in the cache, only the blocks themselves are cached. In addition, index blocks no longer get evicted from the cache when a table is closed, can now use the compressed block cache (if any), and can be shared among multiple table readers. @@ -474,7 +652,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix a bug caused by secondary not skipping the beginning of new MANIFEST. * On DB open, delete WAL trash files left behind in wal_dir -## 6.2.0 (4/30/2019) +## 6.2.0 (2019-04-30) ### New Features * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`. * Improve range scan performance by avoiding per-key upper bound check in BlockBasedTableIterator. @@ -496,7 +674,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Close a WAL file before another thread deletes it. 
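To illustrate the 6.4.0 default changes above (midpoint insertion via `high_pri_pool_ratio` and high-priority caching of index/filter blocks), here is a small sketch that sets the same values explicitly; capacity and shard count are illustrative.

```cpp
#include <rocksdb/cache.h>
#include <rocksdb/options.h>
#include <rocksdb/table.h>

using namespace ROCKSDB_NAMESPACE;

int main() {
  // Reserve half of the LRU cache as a high-priority pool (the 6.4.0 default).
  LRUCacheOptions cache_opts;
  cache_opts.capacity = 1 << 30;  // 1 GiB, illustrative
  cache_opts.num_shard_bits = 6;
  cache_opts.high_pri_pool_ratio = 0.5;

  BlockBasedTableOptions table_options;
  table_options.block_cache = NewLRUCache(cache_opts);
  table_options.cache_index_and_filter_blocks = true;
  table_options.cache_index_and_filter_blocks_with_high_priority = true;

  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  return 0;
}
```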
* Fix an assertion failure `IsFlushPending() == true` caused by one bg thread releasing the db mutex in ~ColumnFamilyData and another thread clearing `flush_requested_` flag. -## 6.1.1 (4/9/2019) +## 6.1.1 (2019-04-09) ### New Features * When reading from option file/string/map, customized comparators and/or merge operators can be filled according to object registry. @@ -506,7 +684,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix a bug in 2PC where a sequence of txn prepare, memtable flush, and crash could result in losing the prepared transaction. * Fix a bug in Encryption Env which could cause encrypted files to be read beyond file boundaries. -## 6.1.0 (3/27/2019) +## 6.1.0 (2019-03-27) ### New Features * Introduce two more stats levels, kExceptHistogramOrTimers and kExceptTimers. * Added a feature to perform data-block sampling for compressibility, and report stats to user. @@ -524,7 +702,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix JEMALLOC_CXX_THROW macro missing from older Jemalloc versions, causing build failures on some platforms. * Fix SstFileReader not able to open file ingested with write_glbal_seqno=true. -## 6.0.0 (2/19/2019) +## 6.0.0 (2019-02-19) ### New Features * Enabled checkpoint on readonly db (DBImplReadOnly). * Make DB ignore dropped column families while committing results of atomic flush. @@ -567,7 +745,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before ### Change Default Options * Change options.compaction_pri's default to kMinOverlappingRatio -## 5.18.0 (11/30/2018) +## 5.18.0 (2018-11-30) ### New Features * Introduced `JemallocNodumpAllocator` memory allocator. When being use, block cache will be excluded from core dump. * Introduced `PerfContextByLevel` as part of `PerfContext` which allows storing perf context at each level. Also replaced `__thread` with `thread_local` keyword for perf_context. Added per-level perf context for bloom filter and `Get` query. @@ -598,7 +776,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix a deadlock caused by compaction and file ingestion waiting for each other in the event of write stalls. * Make DB ignore dropped column families while committing results of atomic flush. -## 5.17.0 (10/05/2018) +## 5.17.0 (2018-10-05) ### Public API Change * `OnTableFileCreated` will now be called for empty files generated during compaction. In that case, `TableFileCreationInfo::file_path` will be "(nil)" and `TableFileCreationInfo::file_size` will be zero. * Add `FlushOptions::allow_write_stall`, which controls whether Flush calls start working immediately, even if it causes user writes to stall, or will wait until flush can be performed without causing write stall (similar to `CompactRangeOptions::allow_write_stall`). Note that the default value is false, meaning we add delay to Flush calls until stalling can be avoided when possible. This is behavior change compared to previous RocksDB versions, where Flush calls didn't check if they might cause stall or not. @@ -612,21 +790,21 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Avoid creating empty SSTs and subsequently deleting them in certain cases during compaction. * Sync CURRENT file contents during checkpoint. 
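A minimal sketch of the `FlushOptions::allow_write_stall` behavior described under 5.17.0 above: by default (false) a Flush call waits until it can run without stalling user writes; setting it to true makes the flush start immediately.

```cpp
#include <rocksdb/db.h>
#include <rocksdb/options.h>

using namespace ROCKSDB_NAMESPACE;

// Trigger a flush right away, accepting a possible write stall.
Status FlushNow(DB* db) {
  FlushOptions flush_options;
  flush_options.allow_write_stall = true;  // do not wait for a stall-free window
  return db->Flush(flush_options);
}
```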
-## 5.16.3 (10/1/2018) +## 5.16.3 (2018-10-01) ### Bug Fixes * Fix crash caused when `CompactFiles` run with `CompactionOptions::compression == CompressionType::kDisableCompressionOption`. Now that setting causes the compression type to be chosen according to the column family-wide compression options. -## 5.16.2 (9/21/2018) +## 5.16.2 (2018-09-21) ### Bug Fixes * Fix bug in partition filters with format_version=4. -## 5.16.1 (9/17/2018) +## 5.16.1 (2018-09-17) ### Bug Fixes * Remove trace_analyzer_tool from rocksdb_lib target in TARGETS file. * Fix RocksDB Java build and tests. * Remove sync point in Block destructor. -## 5.16.0 (8/21/2018) +## 5.16.0 (2018-08-21) ### Public API Change * The merge operands are passed to `MergeOperator::ShouldMerge` in the reversed order relative to how they were merged (passed to FullMerge or FullMergeV2) for performance reasons * GetAllKeyVersions() to take an extra argument of `max_num_ikeys`. @@ -640,7 +818,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before ### Bug Fixes * Fix a bug in misreporting the estimated partition index size in properties block. -## 5.15.0 (7/17/2018) +## 5.15.0 (2018-07-17) ### Public API Change * Remove managed iterator. ReadOptions.managed is not effective anymore. * For bottommost_compression, a compatible CompressionOptions is added via `bottommost_compression_opts`. To keep backward compatible, a new boolean `enabled` is added to CompressionOptions. For compression_opts, it will be always used no matter what value of `enabled` is. For bottommost_compression_opts, it will only be used when user set `enabled=true`, otherwise, compression_opts will be used for bottommost_compression as default. @@ -666,7 +844,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix a bug caused by not copying the block trailer with compressed SST file, direct IO, prefetcher and no compressed block cache. * Fix write can stuck indefinitely if enable_pipelined_write=true. The issue exists since pipelined write was introduced in 5.5.0. -## 5.14.0 (5/16/2018) +## 5.14.0 (2018-05-16) ### Public API Change * Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages. * The background thread naming convention changed (on supporting platforms) to "rocksdb:", e.g., "rocksdb:low0". @@ -699,7 +877,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Add `BlockBasedTableConfig.setBlockCache` to allow sharing a block cache across DB instances. * Added SstFileManager to the Java API to allow managing SST files across DB instances. -## 5.13.0 (3/20/2018) +## 5.13.0 (2018-03-20) ### Public API Change * RocksDBOptionsParser::Parse()'s `ignore_unknown_options` argument will only be effective if the option file shows it is generated using a higher version of RocksDB than the current version. * Remove CompactionEventListener. @@ -715,7 +893,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix a leak in prepared_section_completed_ where the zeroed entries would not removed from the map. * Fix WAL corruption caused by race condition between user write thread and backup/checkpoint thread. -## 5.12.0 (2/14/2018) +## 5.12.0 (2018-02-14) ### Public API Change * Iterator::SeekForPrev is now a pure virtual method. This is to prevent user who implement the Iterator interface fail to implement SeekForPrev by mistake. 
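Relating to the `bottommost_compression_opts` and `enabled` flag introduced in 5.15.0 above (and refined by the 6.15.0 behavior change earlier in this file), a hedged configuration sketch follows; the compression type and dictionary sizes are illustrative and require ZSTD support in the build.

```cpp
#include <rocksdb/options.h>

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  // Non-bottommost levels: plain ZSTD.
  options.compression = kZSTD;

  // Bottommost level: these options are only honored because `enabled` is set.
  options.bottommost_compression = kZSTD;
  options.bottommost_compression_opts.max_dict_bytes = 16 * 1024;
  options.bottommost_compression_opts.zstd_max_train_bytes = 100 * 16 * 1024;
  options.bottommost_compression_opts.enabled = true;
  return 0;
}
```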
* Add `include_end` option to make the range end exclusive when `include_end == false` in `DeleteFilesInRange()`. @@ -737,7 +915,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix advance reservation of arena block addresses. * Fix handling of empty string as checkpoint directory. -## 5.11.0 (01/08/2018) +## 5.11.0 (2018-01-08) ### Public API Change * Add `autoTune` and `getBytesPerSecond()` to RocksJava RateLimiter @@ -754,7 +932,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix a mislabel bug for bottom-pri compaction threads. * Fix DB::Flush() keep waiting after flush finish under certain condition. -## 5.10.0 (12/11/2017) +## 5.10.0 (2017-12-11) ### Public API Change * When running `make` with environment variable `USE_SSE` set and `PORTABLE` unset, will use all machine features available locally. Previously this combination only compiled SSE-related features. @@ -771,7 +949,7 @@ Fixed a potential bug caused by evaluating `TableBuilder::NeedCompact()` before * Fix performance issue in `IngestExternalFile()` affecting databases with large number of SST files. * Fix possible corruption to LSM structure when `DeleteFilesInRange()` deletes a subset of files spanned by a `DeleteRange()` marker. -## 5.9.0 (11/1/2017) +## 5.9.0 (2017-11-01) ### Public API Change * `BackupableDBOptions::max_valid_backups_to_open == 0` now means no backups will be opened during BackupEngine initialization. Previously this condition disabled limiting backups opened. * `DBOptions::preserve_deletes` is a new option that allows one to specify that DB should not drop tombstones for regular deletes if they have sequence number larger than what was set by the new API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)`. Disabled by default. @@ -798,7 +976,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Fix a potential data inconsistency issue during point-in-time recovery. `DB:Open()` will abort if column family inconsistency is found during PIT recovery. * Fix possible metadata corruption in databases using `DeleteRange()`. -## 5.8.0 (08/30/2017) +## 5.8.0 (2017-08-30) ### Public API Change * Users of `Statistics::getHistogramString()` will see fewer histogram buckets and different bucket endpoints. * `Slice::compare` and BytewiseComparator `Compare` no longer accept `Slice`s containing nullptr. @@ -818,7 +996,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Fix transient reappearance of keys covered by range deletions when memtable prefix bloom filter is enabled. * Fix potentially wrong file smallest key when range deletions separated by snapshot are written together. -## 5.7.0 (07/13/2017) +## 5.7.0 (2017-07-13) ### Public API Change * DB property "rocksdb.sstables" now prints keys in hex form. @@ -833,7 +1011,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Bug Fixes * Fix discarding empty compaction output files when `DeleteRange()` is used together with subcompactions. -## 5.6.0 (06/06/2017) +## 5.6.0 (2017-06-06) ### Public API Change * Scheduling flushes and compactions in the same thread pool is no longer supported by setting `max_background_flushes=0`. Instead, users can achieve this by configuring their high-pri thread pool to have zero threads. 
* Replace `Options::max_background_flushes`, `Options::max_background_compactions`, and `Options::base_background_compactions` all with `Options::max_background_jobs`, which automatically decides how many threads to allocate towards flush/compaction. @@ -850,7 +1028,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Bug Fixes * Shouldn't ignore return value of fsync() in flush. -## 5.5.0 (05/17/2017) +## 5.5.0 (2017-05-17) ### New Features * FIFO compaction to support Intra L0 compaction too with CompactionOptionsFIFO.allow_compaction=true. * DB::ResetStats() to reset internal stats. @@ -867,7 +1045,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Bug Fixes * Fix the bug that Direct I/O uses direct reads for non-SST file -## 5.4.0 (04/11/2017) +## 5.4.0 (2017-04-11) ### Public API Change * random_access_max_buffer_size no longer has any effect * Removed Env::EnableReadAhead(), Env::ShouldForwardRawRequest() @@ -884,7 +1062,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Introduce level-based L0->L0 compactions to reduce file count, so write delays are incurred less often. * (Experimental) Partitioning filters which creates an index on the partitions. The feature can be enabled by setting partition_filters when using kFullFilter. Currently the feature also requires two-level indexing to be enabled. Number of partitions is the same as the number of partitions for indexes, which is controlled by metadata_block_size. -## 5.3.0 (03/08/2017) +## 5.3.0 (2017-03-08) ### Public API Change * Remove disableDataSync option. * Remove timeout_hint_us option from WriteOptions. The option has been deprecated and has no effect since 3.13.0. @@ -894,7 +1072,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Bug Fixes * Fix the bug that iterator may skip keys -## 5.2.0 (02/08/2017) +## 5.2.0 (2017-02-08) ### Public API Change * NewLRUCache() will determine number of shard bits automatically based on capacity, if the user doesn't pass one. This also impacts the default block cache when the user doesn't explicit provide one. * Change the default of delayed slowdown value to 16MB/s and further increase the L0 stop condition to 36 files. @@ -912,7 +1090,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Some fixes related to 2PC. * Fix bugs of data corruption in direct I/O -## 5.1.0 (01/13/2017) +## 5.1.0 (2017-01-13) * Support dynamically change `delete_obsolete_files_period_micros` option via SetDBOptions(). * Added EventListener::OnExternalFileIngested which will be called when IngestExternalFile() add a file successfully. * BackupEngine::Open and BackupEngineReadOnly::Open now always return error statuses matching those of the backup Env. @@ -921,7 +1099,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Fix the bug that if 2PC is enabled, checkpoints may loss some recent transactions. * When file copying is needed when creating checkpoints or bulk loading files, fsync the file after the file copying. -## 5.0.0 (11/17/2016) +## 5.0.0 (2016-11-17) ### Public API Change * Options::max_bytes_for_level_multiplier is now a double along with all getters and setters. * Support dynamically change `delayed_write_rate` and `max_total_wal_size` options via SetDBOptions(). 
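A small sketch of dynamically changing `delayed_write_rate` and `max_total_wal_size` at runtime via `SetDBOptions()`, as described above; option values are passed as strings, and the numbers here are illustrative.

```cpp
#include <rocksdb/db.h>

using namespace ROCKSDB_NAMESPACE;

// Adjust write throttling and WAL size limits without reopening the DB.
Status TuneWalLimits(DB* db) {
  return db->SetDBOptions({
      {"delayed_write_rate", "4194304"},     // 4 MB/s
      {"max_total_wal_size", "1073741824"},  // 1 GiB
  });
}
```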
@@ -940,7 +1118,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add LuaCompactionFilter in utilities. This allows developers to write compaction filters in Lua. To use this feature, LUA_PATH needs to be set to the root directory of Lua. * No longer populate "LATEST_BACKUP" file in backup directory, which formerly contained the number of the latest backup. The latest backup can be determined by finding the highest numbered file in the "meta/" subdirectory. -## 4.13.0 (10/18/2016) +## 4.13.0 (2016-10-18) ### Public API Change * DB::GetOptions() reflect dynamic changed options (i.e. through DB::SetOptions()) and return copy of options instead of reference. * Added Statistics::getAndResetTickerCount(). @@ -949,7 +1127,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add DB::SetDBOptions() to dynamic change base_background_compactions and max_background_compactions. * Added Iterator::SeekForPrev(). This new API will seek to the last key that less than or equal to the target key. -## 4.12.0 (9/12/2016) +## 4.12.0 (2016-09-12) ### Public API Change * CancelAllBackgroundWork() flushes all memtables for databases containing writes that have bypassed the WAL (writes issued with WriteOptions::disableWAL=true) before shutting down background threads. * Merge options source_compaction_factor, max_grandparent_overlap_bytes and expanded_compaction_factor into max_compaction_bytes. @@ -961,7 +1139,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Change ticker/histogram statistics implementations to accumulate data in thread-local storage, which improves CPU performance by reducing cache coherency costs. Callers of CreateDBStatistics do not need to change anything to use this feature. * Block cache mid-point insertion, where index and filter block are inserted into LRU block cache with higher priority. The feature can be enabled by setting BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority to true and high_pri_pool_ratio > 0 when creating NewLRUCache. -## 4.11.0 (8/1/2016) +## 4.11.0 (2016-08-01) ### Public API Change * options.memtable_prefix_bloom_huge_page_tlb_size => memtable_huge_page_size. When it is set, RocksDB will try to allocate memory from huge page for memtable too, rather than just memtable bloom filter. @@ -969,7 +1147,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * A tool to migrate DB after options change. See include/rocksdb/utilities/option_change_migration.h. * Add ReadOptions.background_purge_on_iterator_cleanup. If true, we avoid file deletion when destroying iterators. -## 4.10.0 (7/5/2016) +## 4.10.0 (2016-07-05) ### Public API Change * options.memtable_prefix_bloom_bits changes to options.memtable_prefix_bloom_bits_ratio and deprecate options.memtable_prefix_bloom_probes * enum type CompressionType and PerfLevel changes from char to unsigned char. Value of all PerfLevel shift by one. @@ -981,7 +1159,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * RepairDB support for column families. RepairDB now associates data with non-default column families using information embedded in the SST/WAL files (4.7 or later). For data written by 4.6 or earlier, RepairDB associates it with the default column family. * Add options.write_buffer_manager which allows users to control total memtable sizes across multiple DB instances. 
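For the `options.write_buffer_manager` feature noted just above, a minimal sketch of sharing one memtable budget across several DB instances; the 512 MiB cap is illustrative.

```cpp
#include <memory>

#include <rocksdb/options.h>
#include <rocksdb/write_buffer_manager.h>

using namespace ROCKSDB_NAMESPACE;

int main() {
  // One manager caps the combined memtable memory of every DB that uses it.
  auto wbm = std::make_shared<WriteBufferManager>(512 * 1024 * 1024);

  Options options_db1;
  Options options_db2;
  options_db1.write_buffer_manager = wbm;
  options_db2.write_buffer_manager = wbm;
  // ... then DB::Open(options_db1, ...) and DB::Open(options_db2, ...) as usual.
  return 0;
}
```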
-## 4.9.0 (6/9/2016) +## 4.9.0 (2016-06-09) ### Public API changes * Add bottommost_compression option, This option can be used to set a specific compression algorithm for the bottommost level (Last level containing files in the DB). * Introduce CompactionJobInfo::compression, This field state the compression algorithm used to generate the output files of the compaction. @@ -991,7 +1169,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### New Features * Introduce NewSimCache() in rocksdb/utilities/sim_cache.h. This function creates a block cache that is able to give simulation results (mainly hit rate) of simulating block behavior with a configurable cache size. -## 4.8.0 (5/2/2016) +## 4.8.0 (2016-05-02) ### Public API Change * Allow preset compression dictionary for improved compression of block-based tables. This is supported for zlib, zstd, and lz4. The compression dictionary's size is configurable via CompressionOptions::max_dict_bytes. * Delete deprecated classes for creating backups (BackupableDB) and restoring from backups (RestoreBackupableDB). Now, BackupEngine should be used for creating backups, and BackupEngineReadOnly should be used for restorations. For more details, see https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F @@ -1001,12 +1179,12 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### New Features * Add ReadOptions::readahead_size. If non-zero, NewIterator will create a new table reader which performs reads of the given size. -## 4.7.0 (4/8/2016) +## 4.7.0 (2016-04-08) ### Public API Change * rename options compaction_measure_io_stats to report_bg_io_stats and include flush too. * Change some default options. Now default options will optimize for server-workloads. Also enable slowdown and full stop triggers for pending compaction bytes. These changes may cause sub-optimal performance or significant increase of resource usage. To avoid these risks, users can open existing RocksDB with options extracted from RocksDB option files. See https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File for how to use RocksDB option files. Or you can call Options.OldDefaults() to recover old defaults. DEFAULT_OPTIONS_HISTORY.md will track change history of default options. -## 4.6.0 (3/10/2016) +## 4.6.0 (2016-03-10) ### Public API Changes * Change default of BlockBasedTableOptions.format_version to 2. It means default DB created by 4.6 or up cannot be opened by RocksDB version 3.9 or earlier. * Added strict_capacity_limit option to NewLRUCache. If the flag is set to true, insert to cache will fail if no enough capacity can be free. Signature of Cache::Insert() is updated accordingly. @@ -1017,7 +1195,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification. * Deprecate Iterator::IsKeyPinned() and replace it with Iterator::GetProperty() with prop_name="rocksdb.iterator.is.key.pinned" -## 4.5.0 (2/5/2016) +## 4.5.0 (2016-02-05) ### Public API Changes * Add a new perf context level between kEnableCount and kEnableTime. Level 2 now does not include timers for mutexes. * Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll. @@ -1028,7 +1206,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add kPersistedTier to ReadTier. 
This option allows Get and MultiGet to read only the persited data and skip mem-tables if writes were done with disableWAL = true. * Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create a SstFileManager that can be used to track the total size of SST files and control the SST files deletion rate. -## 4.4.0 (1/14/2016) +## 4.4.0 (2016-01-14) ### Public API Changes * Change names in CompactionPri and add a new one. * Deprecate options.soft_rate_limit and add options.soft_pending_compaction_bytes_limit. @@ -1038,7 +1216,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Increase default options.delayed_write_rate to 2MB/s. * Added a new parameter --path to ldb tool. --path accepts the name of either MANIFEST, SST or a WAL file. Either --db or --path can be used when calling ldb. -## 4.3.0 (12/8/2015) +## 4.3.0 (2015-12-08) ### New Features * CompactionFilter has new member function called IgnoreSnapshots which allows CompactionFilter to be called even if there are snapshots later than the key. * RocksDB will now persist options under the same directory as the RocksDB database on successful DB::Open, CreateColumnFamily, DropColumnFamily, and SetOptions. @@ -1048,7 +1226,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Public API Changes * When options.db_write_buffer_size triggers, only the column family with the largest column family size will be flushed, not all the column families. -## 4.2.0 (11/9/2015) +## 4.2.0 (2015-11-09) ### New Features * Introduce CreateLoggerFromOptions(), this function create a Logger for provided DBOptions. * Add GetAggregatedIntProperty(), which returns the sum of the GetIntProperty of all the column families. @@ -1061,7 +1239,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Remove DefaultCompactionFilterFactory. -## 4.1.0 (10/8/2015) +## 4.1.0 (2015-10-08) ### New Features * Added single delete operation as a more efficient way to delete keys that have not been overwritten. * Added experimental AddFile() to DB interface that allow users to add files created by SstFileWriter into an empty Database, see include/rocksdb/sst_file_writer.h and DB::AddFile() for more info. @@ -1075,7 +1253,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * CompactionFilter has a new method FilterMergeOperand() that RocksDB applies to every merge operand during compaction to decide whether to filter the operand. * We removed CompactionFilterV2 interfaces from include/rocksdb/compaction_filter.h. The functionality was deprecated already in version 3.13. -## 4.0.0 (9/9/2015) +## 4.0.0 (2015-09-09) ### New Features * Added support for transactions. See include/rocksdb/utilities/transaction.h for more info. * DB::GetProperty() now accepts "rocksdb.aggregated-table-properties" and "rocksdb.aggregated-table-properties-at-levelN", in which case it returns aggregated table properties of the target column family, or the aggregated table properties of the specified level N if the "at-level" version is used. @@ -1088,7 +1266,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Added Equal() method to the Comparator interface that can optionally be overwritten in cases where equality comparisons can be done more efficiently than three-way comparisons. * Previous 'experimental' OptimisticTransaction class has been replaced by Transaction class. 
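For the transaction support introduced in 4.0.0 above (see include/rocksdb/utilities/transaction.h), a minimal pessimistic-transaction sketch follows; the path and key/value are illustrative.

```cpp
#include <rocksdb/options.h>
#include <rocksdb/utilities/transaction.h>
#include <rocksdb/utilities/transaction_db.h>

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  TransactionDBOptions txn_db_options;

  TransactionDB* txn_db = nullptr;
  Status s = TransactionDB::Open(options, txn_db_options,
                                 "/tmp/rocksdb_txn_example", &txn_db);
  if (!s.ok()) return 1;

  // Writes made through the transaction become visible only on Commit().
  Transaction* txn = txn_db->BeginTransaction(WriteOptions());
  txn->Put("key", "value");
  s = txn->Commit();
  delete txn;
  delete txn_db;
  return s.ok() ? 0 : 1;
}
```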
-## 3.13.0 (8/6/2015) +## 3.13.0 (2015-08-06) ### New Features * RollbackToSavePoint() in WriteBatch/WriteBatchWithIndex * Add NewCompactOnDeletionCollectorFactory() in utilities/table_properties_collectors, which allows rocksdb to mark a SST file as need-compaction when it observes at least D deletion entries in any N consecutive entries in that SST file. Note that this feature depends on an experimental NeedCompact() API --- the result of this API will not persist after DB restart. @@ -1103,7 +1281,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add statistics::getHistogramString() to print detailed distribution of a histogram metric. * Add DBOptions::skip_stats_update_on_db_open. When it is on, DB::Open() will run faster as it skips the random reads required for loading necessary stats from SST files to optimize compaction. -## 3.12.0 (7/2/2015) +## 3.12.0 (2015-07-02) ### New Features * Added experimental support for optimistic transactions. See include/rocksdb/utilities/optimistic_transaction.h for more info. * Added a new way to report QPS from db_bench (check out --report_file and --report_interval_seconds) @@ -1133,7 +1311,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add BackupEngineImpl.options_.max_background_operations to specify the maximum number of operations that may be performed in parallel. Add support for parallelized backup and restore. * Add DB::SyncWAL() that does a WAL sync without blocking writers. -## 3.11.0 (5/19/2015) +## 3.11.0 (2015-05-19) ### New Features * Added a new API Cache::SetCapacity(size_t capacity) to dynamically change the maximum configured capacity of the cache. If the new capacity is less than the existing cache usage, the implementation will try to lower the usage by evicting the necessary number of elements following a strict LRU policy. * Added an experimental API for handling flashcache devices (blacklists background threads from caching their reads) -- NewFlashcacheAwareEnv @@ -1144,7 +1322,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * TablePropertiesCollector::AddUserKey() is added to replace TablePropertiesCollector::Add(). AddUserKey() exposes key type, sequence number and file size up to now to users. * DBOptions::bytes_per_sync used to apply to both WAL and table files. As of 3.11 it applies only to table files. If you want to use this option to sync WAL in the background, please use wal_bytes_per_sync -## 3.10.0 (3/24/2015) +## 3.10.0 (2015-03-24) ### New Features * GetThreadStatus() is now able to report detailed thread status, including: - Thread Operation including flush and compaction. @@ -1179,7 +1357,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * lz4 compression is now included in rocksjava static library when running `make rocksdbjavastatic`. * Overflowing a size_t when setting rocksdb options now throws an IllegalArgumentException, which removes the necessity for a developer to catch these Exceptions explicitly. -## 3.9.0 (12/8/2014) +## 3.9.0 (2014-12-08) ### New Features * Add rocksdb::GetThreadList(), which in the future will return the current status of all @@ -1198,7 +1376,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Improvements * RocksDBLite library now becomes smaller and will be compiled with -fno-exceptions flag. 
-## 3.8.0 (11/14/2014) +## 3.8.0 (2014-11-14) ### Public API changes * BackupEngine::NewBackupEngine() was deprecated; please use BackupEngine::Open() from now on. @@ -1212,14 +1390,14 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * CompactFiles and EventListener, although they are still in experimental state * Full ColumnFamily support in RocksJava. -## 3.7.0 (11/6/2014) +## 3.7.0 (2014-11-06) ### Public API changes * Introduce SetOptions() API to allow adjusting a subset of options dynamically online * Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString() * Remove WriteBatchWithIndex.Delete() overloads using SliceParts * When opening a DB, if options.max_background_compactions is larger than the existing low pri pool of options.env, it will enlarge it. Similarly, options.max_background_flushes is larger than the existing high pri pool of options.env, it will enlarge it. -## 3.6.0 (10/7/2014) +## 3.6.0 (2014-10-07) ### Disk format changes * If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. turn on filter policy @@ -1232,7 +1410,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Change target_file_size_base type to uint64_t from int. * Remove allow_thread_local. This feature was proved to be stable, so we are turning it always-on. -## 3.5.0 (9/3/2014) +## 3.5.0 (2014-09-03) ### New Features * Add include/utilities/write_batch_with_index.h, providing a utility class to query data out of WriteBatch when building it. * Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include: @@ -1243,7 +1421,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Public API changes * The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key. -## 3.4.0 (8/18/2014) +## 3.4.0 (2014-08-18) ### New Features * Support Multiple DB paths in universal style compactions * Add feature of storing plain table index and bloom filter in SST file. @@ -1259,7 +1437,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke * Add DB::GetIntProperty(), which returns DB properties that are integer as uint64_t. * The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key. -## 3.3.0 (7/10/2014) +## 3.3.0 (2014-07-10) ### New Features * Added JSON API prototype. * HashLinklist reduces performance outlier caused by skewed bucket by switching data in the bucket from linked list to skip list. Add parameter threshold_use_skiplist in NewHashLinkListRepFactory(). @@ -1270,7 +1448,7 @@ if set to something > 0 user will see 2 changes in iterators behavior 1) only ke ### Public API changes * Removed NewTotalOrderPlainTableFactory because it is not used and implemented semantically incorrect. -## 3.2.0 (06/20/2014) +## 3.2.0 (2014-06-20) ### Public API changes * We removed seek compaction as a concept from RocksDB because: @@ -1288,7 +1466,7 @@ Because of that, Options::disable_seek_compaction is now obsolete. 
It is still a ### Performance Improvements * Tailing Iterator re-implemeted with ForwardIterator + Cascading Search Hint , see ~20% throughput improvement. -## 3.1.0 (05/21/2014) +## 3.1.0 (2014-05-21) ### Public API changes * Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories @@ -1297,7 +1475,7 @@ Because of that, Options::disable_seek_compaction is now obsolete. It is still a * Hash index for block-based table will be materialized and reconstructed more efficiently. Previously hash index is constructed by scanning the whole table during every table open. * FIFO compaction style -## 3.0.0 (05/05/2014) +## 3.0.0 (2014-05-05) ### Public API changes * Added _LEVEL to all InfoLogLevel enums @@ -1309,7 +1487,7 @@ Because of that, Options::disable_seek_compaction is now obsolete. It is still a * Added an option to use different checksum functions in BlockBasedTableOptions * Added ApplyToAllCacheEntries() function to Cache -## 2.8.0 (04/04/2014) +## 2.8.0 (2014-04-04) * Removed arena.h from public header files. * By default, checksums are verified on every read from database @@ -1338,7 +1516,7 @@ Because of that, Options::disable_seek_compaction is now obsolete. It is still a * Now compaction filter has a V2 interface. It buffers the kv-pairs sharing the same key prefix, process them in batches, and return the batched results back to DB. The new interface uses a new structure CompactionFilterContext for the same purpose as CompactionFilter::Context in V1. * Geo-spatial support for locations and radial-search. -## 2.7.0 (01/28/2014) +## 2.7.0 (2014-01-28) ### Public API changes diff --git a/INSTALL.md b/INSTALL.md index 9163df166d3..0165e9c128d 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -43,6 +43,8 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi command line flags processing. You can compile rocksdb library even if you don't have gflags installed. +* `make check` will also check code formatting, which requires [clang-format](https://clang.llvm.org/docs/ClangFormat.html) + * If you wish to build the RocksJava static target, then cmake is required for building Snappy. ## Supported platforms @@ -94,12 +96,21 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi sudo yum install libasan * Install zstandard: + * With [EPEL](https://fedoraproject.org/wiki/EPEL): + + sudo yum install libzstd-devel + + * With CentOS 8: + + sudo dnf install libzstd-devel + + * From source: - wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz - mv v1.1.3.tar.gz zstd-1.1.3.tar.gz - tar zxvf zstd-1.1.3.tar.gz - cd zstd-1.1.3 - make && sudo make install + wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz + mv v1.1.3.tar.gz zstd-1.1.3.tar.gz + tar zxvf zstd-1.1.3.tar.gz + cd zstd-1.1.3 + make && sudo make install * **OS X**: * Install latest C++ compiler that supports C++ 11: diff --git a/Makefile b/Makefile index 9f0a4923f23..7433056ad9a 100644 --- a/Makefile +++ b/Makefile @@ -55,60 +55,33 @@ DEBUG_LEVEL?=1 # Set the default LIB_MODE to static LIB_MODE?=static -ifeq ($(MAKECMDGOALS),dbg) - DEBUG_LEVEL=2 -endif +# OBJ_DIR is where the object files reside. Default to the current directory +OBJ_DIR?=. 
-ifeq ($(MAKECMDGOALS),clean) - DEBUG_LEVEL=0 -endif +# Check the MAKECMDGOALS to set the DEBUG_LEVEL and LIB_MODE appropriately -ifeq ($(MAKECMDGOALS),release) +ifneq ($(filter clean release install, $(MAKECMDGOALS)),) DEBUG_LEVEL=0 endif - -ifeq ($(MAKECMDGOALS),shared_lib) - LIB_MODE=shared +ifneq ($(filter dbg, $(MAKECMDGOALS)),) + DEBUG_LEVEL=2 +else ifneq ($(filter shared_lib install-shared, $(MAKECMDGOALS)),) DEBUG_LEVEL=0 -endif - -ifeq ($(MAKECMDGOALS),install-shared) LIB_MODE=shared - DEBUG_LEVEL=0 -endif - -ifeq ($(MAKECMDGOALS),static_lib) +else ifneq ($(filter static_lib install-static, $(MAKECMDGOALS)),) DEBUG_LEVEL=0 LIB_MODE=static -endif - -ifeq ($(MAKECMDGOALS),install-static) - DEBUG_LEVEL=0 - LIB_MODE=static -endif - -ifeq ($(MAKECMDGOALS),install) - DEBUG_LEVEL=0 -endif - - -ifneq ($(findstring jtest, $(MAKECMDGOALS)),) +else ifneq ($(filter jtest rocksdbjava%, $(MAKECMDGOALS)),) OBJ_DIR=jl LIB_MODE=shared -endif - -ifneq ($(findstring rocksdbjava, $(MAKECMDGOALS)),) - LIB_MODE=shared - ifneq ($(findstring rocksdbjavastatic, $(MAKECMDGOALS)),) + ifneq ($(findstring rocksdbjavastatic, $(MAKECMDGOALS)),) OBJ_DIR=jls - ifneq ($(DEBUG_LEVEL),2) - DEBUG_LEVEL=0 - endif - ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish) - DEBUG_LEVEL=0 - endif - else - OBJ_DIR=jl + ifneq ($(DEBUG_LEVEL),2) + DEBUG_LEVEL=0 + endif + ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish) + DEBUG_LEVEL=0 + endif endif endif @@ -190,6 +163,11 @@ else endif ifdef ASSERT_STATUS_CHECKED +# For ASC, turn off constructor elision, preventing the case where a constructor returned +# by a method may pass the ASC check if the status is checked in the inner method. Forcing +# the copy constructor to be invoked disables the optimization and will cause the calling method +# to check the status in order to prevent an error from being raised. +PLATFORM_CXXFLAGS += -fno-elide-constructors ifeq ($(filter -DROCKSDB_ASSERT_STATUS_CHECKED,$(OPT)),) OPT += -DROCKSDB_ASSERT_STATUS_CHECKED endif @@ -254,6 +232,8 @@ AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%. # Export some common variables that might have been passed as Make variables # instead of environment variables. dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \ + export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \ + export LDFLAGS="$(EXTRA_LDFLAGS)"; \ export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \ export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \ export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \ @@ -264,6 +244,12 @@ dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \ # this file is generated by the previous line to set build flags and sources include make_config.mk +ROCKSDB_PLUGIN_MKS = $(foreach plugin, $(ROCKSDB_PLUGINS), plugin/$(plugin)/*.mk) +include $(ROCKSDB_PLUGIN_MKS) +ROCKSDB_PLUGIN_SOURCES = $(foreach plugin, $(ROCKSDB_PLUGINS), $(foreach source, $($(plugin)_SOURCES), plugin/$(plugin)/$(source))) +ROCKSDB_PLUGIN_HEADERS = $(foreach plugin, $(ROCKSDB_PLUGINS), $(foreach header, $($(plugin)_HEADERS), plugin/$(plugin)/$(header))) +PLATFORM_LDFLAGS += $(foreach plugin, $(ROCKSDB_PLUGINS), $($(plugin)_LDFLAGS)) + export JAVAC_ARGS CLEAN_FILES += make_config.mk rocksdb.pc @@ -414,6 +400,10 @@ ifdef TEST_UINT128_COMPAT PLATFORM_CCFLAGS += -DTEST_UINT128_COMPAT=1 PLATFORM_CXXFLAGS += -DTEST_UINT128_COMPAT=1 endif +ifdef ROCKSDB_MODIFY_NPHASH + PLATFORM_CCFLAGS += -DROCKSDB_MODIFY_NPHASH=1 + PLATFORM_CXXFLAGS += -DROCKSDB_MODIFY_NPHASH=1 +endif # This (the first rule) must depend on "all". 
default: all @@ -421,6 +411,10 @@ default: all WARNING_FLAGS = -W -Wextra -Wall -Wsign-compare -Wshadow \ -Wunused-parameter +ifeq (,$(filter amd64, $(MACHINE))) + C_WARNING_FLAGS = -Wstrict-prototypes +endif + ifdef USE_CLANG # Used by some teams in Facebook WARNING_FLAGS += -Wshift-sign-overflow @@ -465,38 +459,13 @@ ifeq ($(NO_THREEWAY_CRC32C), 1) CXXFLAGS += -DNO_THREEWAY_CRC32C endif -CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) +CFLAGS += $(C_WARNING_FLAGS) $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers LDFLAGS += $(PLATFORM_LDFLAGS) -# If NO_UPDATE_BUILD_VERSION is set we don't update util/build_version.cc, but -# the file needs to already exist or else the build will fail -ifndef NO_UPDATE_BUILD_VERSION -date := $(shell date +%F) -ifdef FORCE_GIT_SHA - git_sha := $(FORCE_GIT_SHA) -else - git_sha := $(shell git rev-parse HEAD 2>/dev/null) -endif -gen_build_version = sed -e s/@@GIT_SHA@@/$(git_sha)/ -e s/@@GIT_DATE_TIME@@/$(date)/ util/build_version.cc.in - -# Record the version of the source that we are compiling. -# We keep a record of the git revision in this file. It is then built -# as a regular source file as part of the compilation process. -# One can run "strings executable_filename | grep _build_" to find -# the version of the source that we used to build the executable file. -FORCE: -util/build_version.cc: FORCE - $(AM_V_GEN)rm -f $@-t - $(AM_V_at)$(gen_build_version) > $@-t - $(AM_V_at)if test -f $@; then \ - cmp -s $@-t $@ && rm -f $@-t || mv -f $@-t $@; \ - else mv -f $@-t $@; fi -endif - -OBJ_DIR?=. LIB_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(LIB_SOURCES)) +LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ROCKSDB_PLUGIN_SOURCES)) ifeq ($(HAVE_POWER8),1) LIB_OBJECTS += $(patsubst %.c, $(OBJ_DIR)/%.o, $(LIB_SOURCES_C)) LIB_OBJECTS += $(patsubst %.S, $(OBJ_DIR)/%.o, $(LIB_SOURCES_ASM)) @@ -506,6 +475,12 @@ ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) LIB_OBJECTS += $(patsubst %.cpp, $(OBJ_DIR)/%.o, $(FOLLY_SOURCES)) endif +# range_tree is not compatible with non GNU libc on ppc64 +# see https://jira.percona.com/browse/PS-7559 +ifneq ($(PPC_LIBC_IS_GNU),0) + LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(RANGE_TREE_SOURCES)) +endif + GTEST = $(OBJ_DIR)/$(GTEST_DIR)/gtest/gtest-all.o TESTUTIL = $(OBJ_DIR)/test_util/testutil.o TESTHARNESS = $(OBJ_DIR)/test_util/testharness.o $(TESTUTIL) $(GTEST) @@ -516,12 +491,14 @@ VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full TEST_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES)) $(GTEST) BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(BENCH_LIB_SOURCES)) +CACHE_BENCH_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(CACHE_BENCH_LIB_SOURCES)) TOOL_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(TOOL_LIB_SOURCES)) ANALYZE_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ANALYZER_LIB_SOURCES)) STRESS_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(STRESS_LIB_SOURCES)) -ALL_SOURCES = $(LIB_SOURCES) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc -ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) +# Exclude build_version.cc -- a generated source file -- from all sources. 
Not needed for dependencies +ALL_SOURCES = $(filter-out util/build_version.cc, $(LIB_SOURCES)) $(TEST_LIB_SOURCES) $(MOCK_LIB_SOURCES) $(GTEST_DIR)/gtest/gtest-all.cc +ALL_SOURCES += $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(CACHE_BENCH_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) $(STRESS_LIB_SOURCES) ALL_SOURCES += $(TEST_MAIN_SOURCES) $(TOOL_MAIN_SOURCES) $(BENCH_MAIN_SOURCES) TESTS = $(patsubst %.cc, %, $(notdir $(TEST_MAIN_SOURCES))) @@ -532,160 +509,40 @@ ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) ALL_SOURCES += third-party/folly/folly/synchronization/test/DistributedMutexTest.cc endif -PARALLEL_TEST = \ - backupable_db_test \ - db_bloom_filter_test \ - db_compaction_filter_test \ - db_compaction_test \ - db_merge_operator_test \ - db_sst_test \ - db_test \ - db_test2 \ - db_universal_compaction_test \ - db_wal_test \ - column_family_test \ - external_sst_file_test \ - import_column_family_test \ - fault_injection_test \ - file_reader_writer_test \ - inlineskiplist_test \ - manual_compaction_test \ - persistent_cache_test \ - table_test \ - transaction_test \ - transaction_lock_mgr_test \ - write_prepared_transaction_test \ - write_unprepared_transaction_test \ - -ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) - TESTS += folly_synchronization_distributed_mutex_test - PARALLEL_TEST += folly_synchronization_distributed_mutex_test - TESTS_PASSING_ASC = folly_synchronization_distributed_mutex_test -endif - # options_settable_test doesn't pass with UBSAN as we use hack in the test ifdef COMPILE_WITH_UBSAN TESTS := $(shell echo $(TESTS) | sed 's/\boptions_settable_test\b//g') endif ifdef ASSERT_STATUS_CHECKED -# This is a new check for which we will add support incrementally. This -# list can be removed once support is fully added. - TESTS_PASSING_ASC = \ - arena_test \ - autovector_test \ - cache_test \ - lru_cache_test \ - blob_file_addition_test \ - blob_file_builder_test \ - blob_file_garbage_test \ - blob_file_reader_test \ - bloom_test \ - cassandra_format_test \ - cassandra_row_merge_test \ - cassandra_serialize_test \ - cleanable_test \ - coding_test \ - crc32c_test \ - dbformat_test \ - db_basic_test \ - db_with_timestamp_basic_test \ - db_with_timestamp_compaction_test \ - db_options_test \ - db_properties_test \ - db_secondary_test \ - options_file_test \ - defer_test \ - filename_test \ - dynamic_bloom_test \ - env_basic_test \ + # TODO: finish fixing all tests to pass this check + TESTS_FAILING_ASC = \ + c_test \ + db_test \ + db_test2 \ env_test \ - env_logger_test \ - event_logger_test \ - error_handler_fs_test \ - auto_roll_logger_test \ - file_indexer_test \ - flush_job_test \ - hash_table_test \ - hash_test \ - heap_test \ - histogram_test \ - inlineskiplist_test \ - io_posix_test \ - iostats_context_test \ - ldb_cmd_test \ - memkind_kmem_allocator_test \ - merger_test \ - mock_env_test \ - object_registry_test \ - prefix_test \ - repair_test \ - configurable_test \ - options_settable_test \ - options_test \ - random_test \ - range_del_aggregator_test \ - sst_file_reader_test \ - range_tombstone_fragmenter_test \ - repeatable_thread_test \ - skiplist_test \ - slice_test \ - sst_dump_test \ - statistics_test \ - stats_history_test \ - thread_local_test \ - trace_analyzer_test \ - env_timed_test \ - filelock_test \ - timer_queue_test \ - timer_test \ - options_util_test \ - persistent_cache_test \ - util_merge_operators_test \ - block_cache_trace_analyzer_test \ - block_cache_tracer_test \ - cache_simulator_test \ - sim_cache_test \ - version_builder_test \ - version_edit_test \ - 
work_queue_test \ - write_controller_test \ - compaction_iterator_test \ - compaction_job_test \ - compaction_job_stats_test \ - io_tracer_test \ - merge_helper_test \ - memtable_list_test \ - flush_job_test \ - block_based_filter_block_test \ - block_fetcher_test \ - full_filter_block_test \ - partitioned_filter_block_test \ - column_family_test \ - file_reader_writer_test \ - corruption_test \ - db_universal_compaction_test \ - import_column_family_test \ - memory_test \ - table_test \ + range_locking_test \ + testutil_test \ -ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) -TESTS_PASSING_ASC += folly_synchronization_distributed_mutex_test + # Since we have very few ASC exclusions left, excluding them from + # the build is the most convenient way to exclude them from testing + TESTS := $(filter-out $(TESTS_FAILING_ASC),$(TESTS)) endif - # Enable building all unit tests, but use check_some to run only tests - # known to pass ASC (ASSERT_STATUS_CHECKED) - ROCKSDBTESTS_SUBSET ?= $(TESTS_PASSING_ASC) - # Alternate: only build unit tests known to pass ASC, and run them - # with make check - #TESTS := $(filter $(TESTS_PASSING_ASC),$(TESTS)) - #PARALLEL_TEST := $(filter $(TESTS_PASSING_ASC),$(PARALLEL_TEST)) -else - ROCKSDBTESTS_SUBSET ?= $(TESTS) -endif +ROCKSDBTESTS_SUBSET ?= $(TESTS) + +# env_test - suspicious use of test::TmpDir +# deletefile_test - serial because it generates giant temporary files in +# its various tests. Parallel can fill up your /dev/shm +NON_PARALLEL_TEST = \ + env_test \ + deletefile_test \ + +PARALLEL_TEST = $(filter-out $(NON_PARALLEL_TEST), $(TESTS)) + # Not necessarily well thought out or up-to-date, but matches old list TESTS_PLATFORM_DEPENDENT := \ db_basic_test \ - db_with_timestamp_basic_test \ + db_blob_basic_test \ db_encryption_test \ db_test2 \ external_sst_file_basic_test \ @@ -703,6 +560,7 @@ TESTS_PLATFORM_DEPENDENT := \ io_posix_test \ hash_test \ random_test \ + ribbon_test \ thread_local_test \ work_queue_test \ rate_limiter_test \ @@ -768,13 +626,44 @@ else LIBRARY=$(STATIC_LIBRARY) TEST_LIBRARY=$(STATIC_TEST_LIBRARY) TOOLS_LIBRARY=$(STATIC_TOOLS_LIBRARY) -STRESS_LIBRARY=$(STATIC_STRESS_LIBRARY) endif +STRESS_LIBRARY=$(STATIC_STRESS_LIBRARY) ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) +# If NO_UPDATE_BUILD_VERSION is set we don't update util/build_version.cc, but +# the file needs to already exist or else the build will fail +ifndef NO_UPDATE_BUILD_VERSION + +# By default, use the current date-time as the date. If there are no changes, +# we will use the last commit date instead. +build_date := $(shell date "+%Y-%m-%d %T") + +ifdef FORCE_GIT_SHA + git_sha := $(FORCE_GIT_SHA) + git_mod := 1 + git_date := $(build_date) +else + git_sha := $(shell git rev-parse HEAD 2>/dev/null) + git_tag := $(shell git symbolic-ref -q --short HEAD 2> /dev/null || git describe --tags --exact-match 2>/dev/null) + git_mod := $(shell git diff-index HEAD --quiet 2>/dev/null; echo $$?) + git_date := $(shell git log -1 --date=format:"%Y-%m-%d %T" --format="%ad" 2>/dev/null) +endif +gen_build_version = sed -e s/@GIT_SHA@/$(git_sha)/ -e s:@GIT_TAG@:"$(git_tag)": -e s/@GIT_MOD@/"$(git_mod)"/ -e s/@BUILD_DATE@/"$(build_date)"/ -e s/@GIT_DATE@/"$(git_date)"/ util/build_version.cc.in + +# Record the version of the source that we are compiling. 
+# We keep a record of the git revision in this file. It is then built +# as a regular source file as part of the compilation process. +# One can run "strings executable_filename | grep _build_" to find +# the version of the source that we used to build the executable file. +util/build_version.cc: $(filter-out $(OBJ_DIR)/util/build_version.o, $(LIB_OBJECTS)) util/build_version.cc.in + $(AM_V_GEN)rm -f $@-t + $(AM_V_at)$(gen_build_version) > $@ +endif +CLEAN_FILES += util/build_version.cc + default: all #----------------------------------------------- @@ -822,7 +711,8 @@ endif # PLATFORM_SHARED_EXT analyze tools tools_lib \ blackbox_crash_test_with_atomic_flush whitebox_crash_test_with_atomic_flush \ blackbox_crash_test_with_txn whitebox_crash_test_with_txn \ - blackbox_crash_test_with_best_efforts_recovery + blackbox_crash_test_with_best_efforts_recovery \ + blackbox_crash_test_with_ts whitebox_crash_test_with_ts all: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(TESTS) @@ -938,7 +828,7 @@ gen_parallel_tests: # 107.816 PASS t/DBTest.EncodeDecompressedBlockSizeTest # slow_test_regexp = \ - ^.*SnapshotConcurrentAccessTest.*$$|^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$ + ^.*SnapshotConcurrentAccessTest.*$$|^.*SeqAdvanceConcurrentTest.*$$|^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$ prioritize_long_running_tests = \ perl -pe 's,($(slow_test_regexp)),100 $$1,' \ | sort -k1,1gr \ @@ -1060,6 +950,8 @@ crash_test_with_txn: whitebox_crash_test_with_txn blackbox_crash_test_with_txn crash_test_with_best_efforts_recovery: blackbox_crash_test_with_best_efforts_recovery +crash_test_with_ts: whitebox_crash_test_with_ts blackbox_crash_test_with_ts + blackbox_crash_test: db_stress $(PYTHON) -u tools/db_crashtest.py --simple blackbox $(CRASH_TEST_EXT_ARGS) $(PYTHON) -u tools/db_crashtest.py blackbox $(CRASH_TEST_EXT_ARGS) @@ -1073,6 +965,9 @@ blackbox_crash_test_with_txn: db_stress blackbox_crash_test_with_best_efforts_recovery: db_stress $(PYTHON) -u tools/db_crashtest.py --test_best_efforts_recovery blackbox $(CRASH_TEST_EXT_ARGS) +blackbox_crash_test_with_ts: db_stress + $(PYTHON) -u tools/db_crashtest.py --enable_ts blackbox $(CRASH_TEST_EXT_ARGS) + ifeq ($(CRASH_TEST_KILL_ODD),) CRASH_TEST_KILL_ODD=888887 endif @@ -1091,6 +986,10 @@ whitebox_crash_test_with_txn: db_stress $(PYTHON) -u tools/db_crashtest.py --txn whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) +whitebox_crash_test_with_ts: db_stress + $(PYTHON) -u tools/db_crashtest.py --enable_ts whitebox --random_kill_odd \ + $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) + asan_check: clean COMPILE_WITH_ASAN=1 $(MAKE) check -j32 $(MAKE) clean @@ -1236,8 +1135,9 @@ analyze_incremental: $(MAKE) dbg CLEAN_FILES += unity.cc -unity.cc: Makefile +unity.cc: Makefile util/build_version.cc.in rm -f $@ $@-t + $(AM_V_at)$(gen_build_version) > util/build_version.cc for source_file in $(LIB_SOURCES); do \ echo "#include \"$$source_file\"" >> $@-t; \ done @@ -1317,11 +1217,11 @@ $(STATIC_TEST_LIBRARY): $(TEST_OBJECTS) $(AM_V_AR)rm -f $@ $(SHARED_TEST_LIBRARY) $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ -$(STATIC_TOOLS_LIBRARY): $(BENCH_OBJECTS) $(TOOL_OBJECTS) +$(STATIC_TOOLS_LIBRARY): $(TOOL_OBJECTS) $(AM_V_AR)rm -f $@ $(SHARED_TOOLS_LIBRARY) $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ -$(STATIC_STRESS_LIBRARY): 
$(ANALYZE_OBJECTS) $(STRESS_OBJECTS) +$(STATIC_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL) $(AM_V_AR)rm -f $@ $(SHARED_STRESS_LIBRARY) $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ @@ -1333,7 +1233,7 @@ $(SHARED_TOOLS_LIBRARY): $(TOOL_OBJECTS) $(SHARED1) $(AM_V_AR)rm -f $@ $(STATIC_TOOLS_LIBRARY) $(AM_SHARE) -$(SHARED_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(SHARED_TOOLS_LIBRARY) $(SHARED1) +$(SHARED_STRESS_LIBRARY): $(ANALYZE_OBJECTS) $(STRESS_OBJECTS) $(TESTUTIL) $(SHARED_TOOLS_LIBRARY) $(SHARED1) $(AM_V_AR)rm -f $@ $(STATIC_STRESS_LIBRARY) $(AM_SHARE) @@ -1355,7 +1255,7 @@ folly_synchronization_distributed_mutex_test: $(OBJ_DIR)/third-party/folly/folly $(AM_LINK) endif -cache_bench: $(OBJ_DIR)/cache/cache_bench.o $(LIBRARY) +cache_bench: $(OBJ_DIR)/cache/cache_bench.o $(CACHE_BENCH_OBJECTS) $(LIBRARY) $(AM_LINK) persistent_cache_bench: $(OBJ_DIR)/utilities/persistent_cache/persistent_cache_bench.o $(LIBRARY) @@ -1415,6 +1315,9 @@ hash_test: $(OBJ_DIR)/util/hash_test.o $(TEST_LIBRARY) $(LIBRARY) random_test: $(OBJ_DIR)/util/random_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +ribbon_test: $(OBJ_DIR)/util/ribbon_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + option_change_migration_test: $(OBJ_DIR)/utilities/option_change_migration/option_change_migration_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1460,6 +1363,12 @@ slice_transform_test: $(OBJ_DIR)/util/slice_transform_test.o $(TEST_LIBRARY) $(L db_basic_test: $(OBJ_DIR)/db/db_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +db_blob_basic_test: $(OBJ_DIR)/db/blob/db_blob_basic_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_blob_compaction_test: $(OBJ_DIR)/db/blob/db_blob_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + db_with_timestamp_basic_test: $(OBJ_DIR)/db/db_with_timestamp_basic_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1508,6 +1417,9 @@ db_inplace_update_test: $(OBJ_DIR)/db/db_inplace_update_test.o $(TEST_LIBRARY) $ db_iterator_test: $(OBJ_DIR)/db/db_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +db_kv_checksum_test: $(OBJ_DIR)/db/db_kv_checksum_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + db_memtable_test: $(OBJ_DIR)/db/db_memtable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1630,6 +1542,9 @@ compaction_job_test: $(OBJ_DIR)/db/compaction/compaction_job_test.o $(TEST_LIBRA compaction_job_stats_test: $(OBJ_DIR)/db/compaction/compaction_job_stats_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +compaction_service_test: $(OBJ_DIR)/db/compaction/compaction_service_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + compact_on_deletion_collector_test: $(OBJ_DIR)/utilities/table_properties_collectors/compact_on_deletion_collector_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1783,6 +1698,9 @@ compact_files_test: $(OBJ_DIR)/db/compact_files_test.o $(TEST_LIBRARY) $(LIBRARY configurable_test: options/configurable_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +customizable_test: options/customizable_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + options_test: $(OBJ_DIR)/options/options_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1834,7 +1752,7 @@ write_callback_test: $(OBJ_DIR)/db/write_callback_test.o $(TEST_LIBRARY) $(LIBRA heap_test: $(OBJ_DIR)/util/heap_test.o $(GTEST) $(AM_LINK) -transaction_lock_mgr_test: utilities/transactions/transaction_lock_mgr_test.o $(TEST_LIBRARY) $(LIBRARY) +point_lock_manager_test: utilities/transactions/lock/point/point_lock_manager_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) transaction_test: 
$(OBJ_DIR)/utilities/transactions/transaction_test.o $(TEST_LIBRARY) $(LIBRARY) @@ -1903,13 +1821,16 @@ blob_db_test: $(OBJ_DIR)/utilities/blob_db/blob_db_test.o $(TEST_LIBRARY) $(LIBR repeatable_thread_test: $(OBJ_DIR)/util/repeatable_thread_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +range_locking_test: utilities/transactions/lock/range/range_locking_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + range_tombstone_fragmenter_test: $(OBJ_DIR)/db/range_tombstone_fragmenter_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) sst_file_reader_test: $(OBJ_DIR)/table/sst_file_reader_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) -db_secondary_test: $(OBJ_DIR)/db/db_impl/db_secondary_test.o $(TEST_LIBRARY) $(LIBRARY) +db_secondary_test: $(OBJ_DIR)/db/db_secondary_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) block_cache_tracer_test: $(OBJ_DIR)/trace_replay/block_cache_tracer_test.o $(TEST_LIBRARY) $(LIBRARY) @@ -1927,6 +1848,9 @@ blob_file_addition_test: $(OBJ_DIR)/db/blob/blob_file_addition_test.o $(TEST_LIB blob_file_builder_test: $(OBJ_DIR)/db/blob/blob_file_builder_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +blob_file_cache_test: $(OBJ_DIR)/db/blob/blob_file_cache_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + blob_file_garbage_test: $(OBJ_DIR)/db/blob/blob_file_garbage_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1954,6 +1878,15 @@ io_tracer_parser_test: $(OBJ_DIR)/tools/io_tracer_parser_test.o $(OBJ_DIR)/tools io_tracer_parser: $(OBJ_DIR)/tools/io_tracer_parser.o $(TOOLS_LIBRARY) $(LIBRARY) $(AM_LINK) +db_blob_corruption_test: $(OBJ_DIR)/db/blob/db_blob_corruption_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +db_write_buffer_manager_test: $(OBJ_DIR)/db/db_write_buffer_manager_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + +clipping_iterator_test: $(OBJ_DIR)/db/compaction/clipping_iterator_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + #------------------------------------------------- # make install related stuff PREFIX ?= /usr/local @@ -1978,6 +1911,10 @@ install-headers: gen-pc for header in `$(FIND) "include/rocksdb" -type f -name *.h`; do \ install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/$$header; \ done + for header in $(ROCKSDB_PLUGIN_HEADERS); do \ + install -d $(DESTDIR)/$(PREFIX)/include/rocksdb/`dirname $$header`; \ + install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/include/rocksdb/$$header; \ + done install -C -m 644 rocksdb.pc $(INSTALL_LIBDIR)/pkgconfig/rocksdb.pc install-static: install-headers $(LIBRARY) @@ -2061,11 +1998,11 @@ BZIP2_DOWNLOAD_BASE ?= https://sourceware.org/pub/bzip2 SNAPPY_VER ?= 1.1.8 SNAPPY_SHA256 ?= 16b677f07832a612b0836178db7f374e414f94657c138e6993cbfc5dcc58651f SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive -LZ4_VER ?= 1.9.2 -LZ4_SHA256 ?= 658ba6191fa44c92280d4aa2c271b0f4fbc0e34d249578dd05e50e76d0e5efcc +LZ4_VER ?= 1.9.3 +LZ4_SHA256 ?= 030644df4611007ff7dc962d981f390361e6c97a34e5cbc393ddfbe019ffe2c1 LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive -ZSTD_VER ?= 1.4.4 -ZSTD_SHA256 ?= a364f5162c7d1a455cc915e8e3cf5f4bd8b75d09bc0f53965b0c9ca1383c52c8 +ZSTD_VER ?= 1.4.9 +ZSTD_SHA256 ?= acf714d98e3db7b876e5b540cbf6dee298f60eb3c0723104f6d3f065cd60d6a8 ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive CURL_SSL_OPTS ?= --tlsv1 @@ -2097,80 +2034,80 @@ ifeq ($(PLATFORM), OS_AIX) SNAPPY_MAKE_TARGET = libsnappy.la endif ifeq ($(PLATFORM), OS_OPENBSD) - JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/openbsd + JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/openbsd ROCKSDBJNILIB = 
librocksdbjni-openbsd$(ARCH).so - ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-openbsd$(ARCH).jar + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-openbsd$(ARCH).jar endif -libz.a: - -rm -rf zlib-$(ZLIB_VER) -ifeq (,$(wildcard ./zlib-$(ZLIB_VER).tar.gz)) +zlib-$(ZLIB_VER).tar.gz: curl --fail --output zlib-$(ZLIB_VER).tar.gz --location ${ZLIB_DOWNLOAD_BASE}/zlib-$(ZLIB_VER).tar.gz -endif ZLIB_SHA256_ACTUAL=`$(SHA256_CMD) zlib-$(ZLIB_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(ZLIB_SHA256)" != "$$ZLIB_SHA256_ACTUAL" ]; then \ echo zlib-$(ZLIB_VER).tar.gz checksum mismatch, expected=\"$(ZLIB_SHA256)\" actual=\"$$ZLIB_SHA256_ACTUAL\"; \ exit 1; \ fi + +libz.a: zlib-$(ZLIB_VER).tar.gz + -rm -rf zlib-$(ZLIB_VER) tar xvzf zlib-$(ZLIB_VER).tar.gz - cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${EXTRA_CFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' ./configure --static && $(MAKE) + cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' ./configure --static && $(MAKE) cp zlib-$(ZLIB_VER)/libz.a . -libbz2.a: - -rm -rf bzip2-$(BZIP2_VER) -ifeq (,$(wildcard ./bzip2-$(BZIP2_VER).tar.gz)) +bzip2-$(BZIP2_VER).tar.gz: curl --fail --output bzip2-$(BZIP2_VER).tar.gz --location ${CURL_SSL_OPTS} ${BZIP2_DOWNLOAD_BASE}/bzip2-$(BZIP2_VER).tar.gz -endif BZIP2_SHA256_ACTUAL=`$(SHA256_CMD) bzip2-$(BZIP2_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(BZIP2_SHA256)" != "$$BZIP2_SHA256_ACTUAL" ]; then \ echo bzip2-$(BZIP2_VER).tar.gz checksum mismatch, expected=\"$(BZIP2_SHA256)\" actual=\"$$BZIP2_SHA256_ACTUAL\"; \ exit 1; \ fi + +libbz2.a: bzip2-$(BZIP2_VER).tar.gz + -rm -rf bzip2-$(BZIP2_VER) tar xvzf bzip2-$(BZIP2_VER).tar.gz - cd bzip2-$(BZIP2_VER) && $(MAKE) CFLAGS='-fPIC -O2 -g -D_FILE_OFFSET_BITS=64 ${EXTRA_CFLAGS}' AR='ar ${EXTRA_ARFLAGS}' + cd bzip2-$(BZIP2_VER) && $(MAKE) CFLAGS='-fPIC -O2 -g -D_FILE_OFFSET_BITS=64 ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' AR='ar ${EXTRA_ARFLAGS}' cp bzip2-$(BZIP2_VER)/libbz2.a . -libsnappy.a: - -rm -rf snappy-$(SNAPPY_VER) -ifeq (,$(wildcard ./snappy-$(SNAPPY_VER).tar.gz)) +snappy-$(SNAPPY_VER).tar.gz: curl --fail --output snappy-$(SNAPPY_VER).tar.gz --location ${CURL_SSL_OPTS} ${SNAPPY_DOWNLOAD_BASE}/$(SNAPPY_VER).tar.gz -endif SNAPPY_SHA256_ACTUAL=`$(SHA256_CMD) snappy-$(SNAPPY_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(SNAPPY_SHA256)" != "$$SNAPPY_SHA256_ACTUAL" ]; then \ echo snappy-$(SNAPPY_VER).tar.gz checksum mismatch, expected=\"$(SNAPPY_SHA256)\" actual=\"$$SNAPPY_SHA256_ACTUAL\"; \ exit 1; \ fi + +libsnappy.a: snappy-$(SNAPPY_VER).tar.gz + -rm -rf snappy-$(SNAPPY_VER) tar xvzf snappy-$(SNAPPY_VER).tar.gz mkdir snappy-$(SNAPPY_VER)/build - cd snappy-$(SNAPPY_VER)/build && CFLAGS='${EXTRA_CFLAGS}' CXXFLAGS='${EXTRA_CXXFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && $(MAKE) ${SNAPPY_MAKE_TARGET} + cd snappy-$(SNAPPY_VER)/build && CFLAGS='${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' CXXFLAGS='${JAVA_STATIC_DEPS_CXXFLAGS} ${EXTRA_CXXFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON ${PLATFORM_CMAKE_FLAGS} .. && $(MAKE) ${SNAPPY_MAKE_TARGET} cp snappy-$(SNAPPY_VER)/build/libsnappy.a . 
-liblz4.a: - -rm -rf lz4-$(LZ4_VER) -ifeq (,$(wildcard ./lz4-$(LZ4_VER).tar.gz)) +lz4-$(LZ4_VER).tar.gz: curl --fail --output lz4-$(LZ4_VER).tar.gz --location ${CURL_SSL_OPTS} ${LZ4_DOWNLOAD_BASE}/v$(LZ4_VER).tar.gz -endif LZ4_SHA256_ACTUAL=`$(SHA256_CMD) lz4-$(LZ4_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(LZ4_SHA256)" != "$$LZ4_SHA256_ACTUAL" ]; then \ echo lz4-$(LZ4_VER).tar.gz checksum mismatch, expected=\"$(LZ4_SHA256)\" actual=\"$$LZ4_SHA256_ACTUAL\"; \ exit 1; \ fi + +liblz4.a: lz4-$(LZ4_VER).tar.gz + -rm -rf lz4-$(LZ4_VER) tar xvzf lz4-$(LZ4_VER).tar.gz - cd lz4-$(LZ4_VER)/lib && $(MAKE) CFLAGS='-fPIC -O2 ${EXTRA_CFLAGS}' all + cd lz4-$(LZ4_VER)/lib && $(MAKE) CFLAGS='-fPIC -O2 ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' all cp lz4-$(LZ4_VER)/lib/liblz4.a . -libzstd.a: - -rm -rf zstd-$(ZSTD_VER) -ifeq (,$(wildcard ./zstd-$(ZSTD_VER).tar.gz)) +zstd-$(ZSTD_VER).tar.gz: curl --fail --output zstd-$(ZSTD_VER).tar.gz --location ${CURL_SSL_OPTS} ${ZSTD_DOWNLOAD_BASE}/v$(ZSTD_VER).tar.gz -endif ZSTD_SHA256_ACTUAL=`$(SHA256_CMD) zstd-$(ZSTD_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(ZSTD_SHA256)" != "$$ZSTD_SHA256_ACTUAL" ]; then \ echo zstd-$(ZSTD_VER).tar.gz checksum mismatch, expected=\"$(ZSTD_SHA256)\" actual=\"$$ZSTD_SHA256_ACTUAL\"; \ exit 1; \ fi + +libzstd.a: zstd-$(ZSTD_VER).tar.gz + -rm -rf zstd-$(ZSTD_VER) tar xvzf zstd-$(ZSTD_VER).tar.gz - cd zstd-$(ZSTD_VER)/lib && DESTDIR=. PREFIX= $(MAKE) CFLAGS='-fPIC -O2 ${EXTRA_CFLAGS}' install + cd zstd-$(ZSTD_VER)/lib && DESTDIR=. PREFIX= $(MAKE) CFLAGS='-fPIC -O2 ${JAVA_STATIC_DEPS_CCFLAGS} ${EXTRA_CFLAGS}' LDFLAGS='${JAVA_STATIC_DEPS_LDFLAGS} ${EXTRA_LDFLAGS}' libzstd.a cp zstd-$(ZSTD_VER)/lib/libzstd.a . # A version of each $(LIB_OBJECTS) compiled with -fPIC and a fixed set of static compression libraries @@ -2179,14 +2116,23 @@ JAVA_COMPRESSIONS = libz.a libbz2.a libsnappy.a liblz4.a libzstd.a endif JAVA_STATIC_FLAGS = -DZLIB -DBZIP2 -DSNAPPY -DLZ4 -DZSTD -JAVA_STATIC_INCLUDES = -I./zlib-$(ZLIB_VER) -I./bzip2-$(BZIP2_VER) -I./snappy-$(SNAPPY_VER) -I./lz4-$(LZ4_VER)/lib -I./zstd-$(ZSTD_VER)/lib/include -ifneq ($(findstring rocksdbjavastatic, $(MAKECMDGOALS)),) +JAVA_STATIC_INCLUDES = -I./zlib-$(ZLIB_VER) -I./bzip2-$(BZIP2_VER) -I./snappy-$(SNAPPY_VER) -I./snappy-$(SNAPPY_VER)/build -I./lz4-$(LZ4_VER)/lib -I./zstd-$(ZSTD_VER)/lib -I./zstd-$(ZSTD_VER)/lib/dictBuilder + +ifneq ($(findstring rocksdbjavastatic, $(filter-out rocksdbjavastatic_deps, $(MAKECMDGOALS))),) CXXFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -CFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) +CFLAGS += $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) +endif +rocksdbjavastatic: +ifeq ($(JAVA_HOME),) + $(error JAVA_HOME is not set) endif -rocksdbjavastatic: $(LIB_OBJECTS) $(JAVA_COMPRESSIONS) - cd java;$(MAKE) javalib; - rm -f ./java/target/$(ROCKSDBJNILIB) + $(MAKE) rocksdbjavastatic_deps + $(MAKE) rocksdbjavastatic_libobjects + $(MAKE) rocksdbjavastatic_javalib + +rocksdbjavastatic_javalib: + cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) javalib + rm -f java/target/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC \ -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) \ $(LIB_OBJECTS) $(COVERAGEFLAGS) \ @@ -2203,6 +2149,10 @@ rocksdbjavastatic: $(LIB_OBJECTS) $(JAVA_COMPRESSIONS) openssl sha1 java/target/$(ROCKSDB_JAVADOCS_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAVADOCS_JAR).sha1 openssl sha1 java/target/$(ROCKSDB_SOURCES_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_SOURCES_JAR).sha1 +rocksdbjavastatic_deps: $(JAVA_COMPRESSIONS) + +rocksdbjavastatic_libobjects: $(LIB_OBJECTS) + rocksdbjavastaticrelease: rocksdbjavastatic cd java/crossbuild && (vagrant destroy -f || true) && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 && vagrant up linux64-musl && vagrant halt linux64-musl cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md @@ -2218,7 +2168,7 @@ rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 roc rocksdbjavastaticdockerx86: mkdir -p java/target - docker run --rm --name rocksdb_linux_x86-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_x86-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerx86_64: mkdir -p java/target @@ -2234,7 +2184,7 @@ rocksdbjavastaticdockerarm64v8: rocksdbjavastaticdockerx86musl: mkdir -p java/target - docker run --rm --name rocksdb_linux_x86-musl-be --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh + docker run --rm --name rocksdb_linux_x86-musl-be --platform linux/386 --attach stdin --attach stdout --attach stderr --volume $(HOME)/.m2:/root/.m2:ro --volume `pwd`:/rocksdb-host:ro --volume /rocksdb-local-build --volume `pwd`/java/target:/rocksdb-java-target --env DEBUG_LEVEL=$(DEBUG_LEVEL) evolvedbinary/rocksjava:alpine3_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh rocksdbjavastaticdockerx86_64musl: mkdir -p java/target @@ -2272,7 +2222,10 @@ jl/%.o: %.cc $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -fPIC -c $< -o $@ $(COVERAGEFLAGS) rocksdbjava: $(LIB_OBJECTS) - $(AM_V_GEN)cd java;$(MAKE) javalib; +ifeq ($(JAVA_HOME),) + $(error JAVA_HOME is not set) +endif + $(AM_V_GEN)cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) javalib; $(AM_V_at)rm -f ./java/target/$(ROCKSDBJNILIB) $(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIB_OBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) $(AM_V_at)cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md @@ -2284,13 +2237,13 @@ jclean: cd java;$(MAKE) clean; jtest_compile: rocksdbjava - cd java;$(MAKE) java_test + cd java; SHA256_CMD='$(SHA256_CMD)' $(MAKE) java_test jtest_run: cd java;$(MAKE) run_test jtest: rocksdbjava - cd java;$(MAKE) sample;$(MAKE) test; + cd java;$(MAKE) sample; SHA256_CMD='$(SHA256_CMD)' $(MAKE) test; $(PYTHON) tools/check_all_python.py # TODO peterd: find a better place for this check in CI targets jdb_bench: @@ -2349,12 +2302,14 @@ endif # --------------------------------------------------------------------------- # Source files dependencies detection # --------------------------------------------------------------------------- - +# If skip dependencies is ON, skip including the dep files +ifneq ($(SKIP_DEPENDS), 1) DEPFILES = $(patsubst %.cc, $(OBJ_DIR)/%.cc.d, $(ALL_SOURCES)) DEPFILES+ = $(patsubst %.c, $(OBJ_DIR)/%.c.d, $(LIB_SOURCES_C) $(TEST_MAIN_SOURCES_C)) ifeq ($(USE_FOLLY_DISTRIBUTED_MUTEX),1) DEPFILES +=$(patsubst %.cpp, $(OBJ_DIR)/%.cpp.d, $(FOLLY_SOURCES)) endif +endif # Add proper dependency support so changing a .h file forces a .cc file to # rebuild. @@ -2394,20 +2349,9 @@ endif build_subset_tests: $(ROCKSDBTESTS_SUBSET) $(AM_V_GEN)if [ -n "$${ROCKSDBTESTS_SUBSET_TESTS_TO_FILE}" ]; then echo "$(ROCKSDBTESTS_SUBSET)" > "$${ROCKSDBTESTS_SUBSET_TESTS_TO_FILE}"; else echo "$(ROCKSDBTESTS_SUBSET)"; fi -# if the make goal is either "clean" or "format", we shouldn't -# try to import the *.d files. -# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly -# working solution. -ifneq ($(MAKECMDGOALS),clean) -ifneq ($(MAKECMDGOALS),format) -ifneq ($(MAKECMDGOALS),jclean) -ifneq ($(MAKECMDGOALS),jtest) -ifneq ($(MAKECMDGOALS),package) -ifneq ($(MAKECMDGOALS),analyze) +# Remove the rules for which dependencies should not be generated and see if any are left. +#If so, include the dependencies; if not, do not include the dependency files +ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test, $(MAKECMDGOALS)) +ifneq ("$(ROCKS_DEP_RULES)", "") -include $(DEPFILES) endif -endif -endif -endif -endif -endif diff --git a/PLUGINS.md b/PLUGINS.md new file mode 100644 index 00000000000..ec30b12f15b --- /dev/null +++ b/PLUGINS.md @@ -0,0 +1,4 @@ +This is the list of all known third-party plugins for RocksDB. If something is missing, please open a pull request to add it. + +* [Dedupfs](https://github.com/ajkr/dedupfs): an example for plugin developers to reference +* [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices diff --git a/TARGETS b/TARGETS index 96da3a19731..563252a39d3 100644 --- a/TARGETS +++ b/TARGETS @@ -1,4 +1,5 @@ -# This file @generated by `python3 buckifier/buckify_rocksdb.py` +# This file @generated by: +#$ python3 buckifier/buckify_rocksdb.py # --> DO NOT EDIT MANUALLY <-- # This file is a Facebook-specific integration for buck builds, so can # only be validated by Facebook employees. 
@@ -9,7 +10,7 @@ load(":defs.bzl", "test_binary") REPO_PATH = package_name() + "/" -ROCKSDB_COMPILER_FLAGS = [ +ROCKSDB_COMPILER_FLAGS_0 = [ "-fno-builtin-memcmp", # Needed to compile in fbcode "-Wno-expansion-to-defined", @@ -24,10 +25,10 @@ ROCKSDB_EXTERNAL_DEPS = [ ("zlib", None, "z"), ("gflags", None, "gflags"), ("lz4", None, "lz4"), - ("zstd", None), + ("zstd", None, "zstd"), ] -ROCKSDB_OS_DEPS = [ +ROCKSDB_OS_DEPS_0 = [ ( "linux", ["third-party//numa:numa", "third-party//liburing:uring", "third-party//tbb:tbb"], @@ -38,7 +39,7 @@ ROCKSDB_OS_DEPS = [ ), ] -ROCKSDB_OS_PREPROCESSOR_FLAGS = [ +ROCKSDB_OS_PREPROCESSOR_FLAGS_0 = [ ( "linux", [ @@ -86,10 +87,12 @@ ROCKSDB_PREPROCESSOR_FLAGS = [ # Added missing flags from output of build_detect_platform "-DROCKSDB_BACKTRACE", +] - # Directories with files for #include - "-I" + REPO_PATH + "include/", - "-I" + REPO_PATH, +# Directories with files for #include +ROCKSDB_INCLUDE_PATHS = [ + "", + "include", ] ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { @@ -104,18 +107,18 @@ is_opt_mode = build_mode.startswith("opt") # -DNDEBUG is added by default in opt mode in fbcode. But adding it twice # doesn't harm and avoid forgetting to add it. -ROCKSDB_COMPILER_FLAGS += (["-DNDEBUG"] if is_opt_mode else []) +ROCKSDB_COMPILER_FLAGS = ROCKSDB_COMPILER_FLAGS_0 + (["-DNDEBUG"] if is_opt_mode else []) sanitizer = read_config("fbcode", "sanitizer") # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. -ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( +ROCKSDB_OS_PREPROCESSOR_FLAGS = ROCKSDB_OS_PREPROCESSOR_FLAGS_0 + ([( "linux", ["-DROCKSDB_JEMALLOC"], )] if sanitizer == "" else []) -ROCKSDB_OS_DEPS += ([( +ROCKSDB_OS_DEPS = ROCKSDB_OS_DEPS_0 + ([( "linux", ["third-party//jemalloc:headers"], )] if sanitizer == "" else []) @@ -129,6 +132,7 @@ cpp_library( name = "rocksdb_lib", srcs = [ "cache/cache.cc", + "cache/cache_entry_roles.cc", "cache/clock_cache.cc", "cache/lru_cache.cc", "cache/sharded_cache.cc", @@ -148,8 +152,10 @@ cpp_library( "cloud/manifest_reader.cc", "cloud/purge.cc", "db/arena_wrapped_db_iter.cc", + "db/blob/blob_fetcher.cc", "db/blob/blob_file_addition.cc", "db/blob/blob_file_builder.cc", + "db/blob/blob_file_cache.cc", "db/blob/blob_file_garbage.cc", "db/blob/blob_file_meta.cc", "db/blob/blob_file_reader.cc", @@ -159,7 +165,6 @@ cpp_library( "db/builder.cc", "db/c.cc", "db/column_family.cc", - "db/compacted_db_impl.cc", "db/compaction/compaction.cc", "db/compaction/compaction_iterator.cc", "db/compaction/compaction_job.cc", @@ -170,6 +175,7 @@ cpp_library( "db/compaction/sst_partitioner.cc", "db/convenience.cc", "db/db_filesnapshot.cc", + "db/db_impl/compacted_db_impl.cc", "db/db_impl/db_impl.cc", "db/db_impl/db_impl_compaction_flush.cc", "db/db_impl/db_impl_debug.cc", @@ -221,6 +227,7 @@ cpp_library( "db/write_batch_base.cc", "db/write_controller.cc", "db/write_thread.cc", + "env/composite_env.cc", "env/env.cc", "env/env_chroot.cc", "env/env_encryption.cc", @@ -229,12 +236,14 @@ cpp_library( "env/file_system.cc", "env/file_system_tracer.cc", "env/fs_posix.cc", + "env/fs_remap.cc", "env/io_posix.cc", "env/mock_env.cc", "file/delete_scheduler.cc", "file/file_prefetch_buffer.cc", "file/file_util.cc", "file/filename.cc", + "file/line_file_reader.cc", "file/random_access_file_reader.cc", "file/read_write_util.cc", "file/readahead_raf.cc", @@ -270,6 +279,7 @@ cpp_library( "monitoring/thread_status_util_debug.cc", "options/cf_options.cc", "options/configurable.cc", + 
"options/customizable.cc", "options/db_options.cc", "options/options.cc", "options/options_helper.cc", @@ -348,12 +358,14 @@ cpp_library( "util/compression_context_cache.cc", "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", + "util/crc32c_arm64.cc", "util/dynamic_bloom.cc", "util/file_checksum_helper.cc", "util/hash.cc", "util/murmurhash.cc", "util/random.cc", "util/rate_limiter.cc", + "util/ribbon_config.cc", "util/slice.cc", "util/status.cc", "util/string_util.cc", @@ -399,8 +411,23 @@ cpp_library( "utilities/simulator_cache/sim_cache.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", "utilities/trace/file_trace_reader_writer.cc", - "utilities/transactions/lock/lock_tracker.cc", - "utilities/transactions/lock/point_lock_tracker.cc", + "utilities/transactions/lock/lock_manager.cc", + "utilities/transactions/lock/point/point_lock_manager.cc", + "utilities/transactions/lock/point/point_lock_tracker.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc", + "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc", + "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc", + "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc", "utilities/transactions/optimistic_transaction.cc", "utilities/transactions/optimistic_transaction_db_impl.cc", "utilities/transactions/pessimistic_transaction.cc", @@ -408,7 +435,6 @@ cpp_library( "utilities/transactions/snapshot_checker.cc", "utilities/transactions/transaction_base.cc", "utilities/transactions/transaction_db_mutex_impl.cc", - "utilities/transactions/transaction_lock_mgr.cc", "utilities/transactions/transaction_util.cc", "utilities/transactions/write_prepared_txn.cc", "utilities/transactions/write_prepared_txn_db.cc", @@ -424,6 +450,7 @@ cpp_library( os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [], external_deps = ROCKSDB_EXTERNAL_DEPS, link_whole = False, @@ -433,6 +460,7 @@ cpp_library( name = "rocksdb_whole_archive_lib", srcs = [ "cache/cache.cc", + "cache/cache_entry_roles.cc", "cache/clock_cache.cc", "cache/lru_cache.cc", "cache/sharded_cache.cc", @@ -452,8 +480,10 @@ cpp_library( "cloud/manifest_reader.cc", "cloud/purge.cc", "db/arena_wrapped_db_iter.cc", + "db/blob/blob_fetcher.cc", "db/blob/blob_file_addition.cc", "db/blob/blob_file_builder.cc", + "db/blob/blob_file_cache.cc", "db/blob/blob_file_garbage.cc", "db/blob/blob_file_meta.cc", "db/blob/blob_file_reader.cc", @@ -463,7 +493,6 @@ cpp_library( "db/builder.cc", "db/c.cc", "db/column_family.cc", - "db/compacted_db_impl.cc", "db/compaction/compaction.cc", "db/compaction/compaction_iterator.cc", 
"db/compaction/compaction_job.cc", @@ -474,6 +503,7 @@ cpp_library( "db/compaction/sst_partitioner.cc", "db/convenience.cc", "db/db_filesnapshot.cc", + "db/db_impl/compacted_db_impl.cc", "db/db_impl/db_impl.cc", "db/db_impl/db_impl_compaction_flush.cc", "db/db_impl/db_impl_debug.cc", @@ -525,6 +555,7 @@ cpp_library( "db/write_batch_base.cc", "db/write_controller.cc", "db/write_thread.cc", + "env/composite_env.cc", "env/env.cc", "env/env_chroot.cc", "env/env_encryption.cc", @@ -533,12 +564,14 @@ cpp_library( "env/file_system.cc", "env/file_system_tracer.cc", "env/fs_posix.cc", + "env/fs_remap.cc", "env/io_posix.cc", "env/mock_env.cc", "file/delete_scheduler.cc", "file/file_prefetch_buffer.cc", "file/file_util.cc", "file/filename.cc", + "file/line_file_reader.cc", "file/random_access_file_reader.cc", "file/read_write_util.cc", "file/readahead_raf.cc", @@ -574,6 +607,7 @@ cpp_library( "monitoring/thread_status_util_debug.cc", "options/cf_options.cc", "options/configurable.cc", + "options/customizable.cc", "options/db_options.cc", "options/options.cc", "options/options_helper.cc", @@ -652,12 +686,14 @@ cpp_library( "util/compression_context_cache.cc", "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", + "util/crc32c_arm64.cc", "util/dynamic_bloom.cc", "util/file_checksum_helper.cc", "util/hash.cc", "util/murmurhash.cc", "util/random.cc", "util/rate_limiter.cc", + "util/ribbon_config.cc", "util/slice.cc", "util/status.cc", "util/string_util.cc", @@ -703,8 +739,23 @@ cpp_library( "utilities/simulator_cache/sim_cache.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", "utilities/trace/file_trace_reader_writer.cc", - "utilities/transactions/lock/lock_tracker.cc", - "utilities/transactions/lock/point_lock_tracker.cc", + "utilities/transactions/lock/lock_manager.cc", + "utilities/transactions/lock/point/point_lock_manager.cc", + "utilities/transactions/lock/point/point_lock_tracker.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc", + "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc", + "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc", + "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc", + "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc", + "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc", "utilities/transactions/optimistic_transaction.cc", "utilities/transactions/optimistic_transaction_db_impl.cc", "utilities/transactions/pessimistic_transaction.cc", @@ -712,7 +763,6 @@ cpp_library( "utilities/transactions/snapshot_checker.cc", "utilities/transactions/transaction_base.cc", "utilities/transactions/transaction_db_mutex_impl.cc", - "utilities/transactions/transaction_lock_mgr.cc", "utilities/transactions/transaction_util.cc", "utilities/transactions/write_prepared_txn.cc", "utilities/transactions/write_prepared_txn_db.cc", @@ -728,6 +778,7 @@ 
cpp_library( os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [], external_deps = ROCKSDB_EXTERNAL_DEPS, link_whole = True, @@ -751,6 +802,7 @@ cpp_library( os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [":rocksdb_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS + [ ("googletest", None, "gtest"), @@ -764,6 +816,7 @@ cpp_library( "test_util/testutil.cc", "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", "tools/db_bench_tool.cc", + "tools/simulated_hybrid_file_system.cc", "tools/trace_analyzer_tool.cc", ], auto_headers = AutoHeaders.RECURSIVE_GLOB, @@ -772,6 +825,22 @@ cpp_library( os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + deps = [":rocksdb_lib"], + external_deps = ROCKSDB_EXTERNAL_DEPS, + link_whole = False, +) + +cpp_library( + name = "rocksdb_cache_bench_tools_lib", + srcs = ["cache/cache_bench_tool.cc"], + auto_headers = AutoHeaders.RECURSIVE_GLOB, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + os_deps = ROCKSDB_OS_DEPS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [":rocksdb_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, link_whole = False, @@ -799,30 +868,30 @@ cpp_library( os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = ROCKSDB_LIB_DEPS, external_deps = ROCKSDB_EXTERNAL_DEPS, ) -if not is_opt_mode: - cpp_binary( - name = "c_test_bin", - srcs = ["db/c_test.c"], - arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, - os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, - compiler_flags = ROCKSDB_COMPILER_FLAGS, - preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [":rocksdb_test_lib"], - ) +cpp_binary( + name = "c_test_bin", + srcs = ["db/c_test.c"], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + deps = [":rocksdb_test_lib"], +) if not is_opt_mode else None -if not is_opt_mode: - custom_unittest( - "c_test", - command = [ - native.package_name() + "/buckifier/rocks_test_runner.sh", - "$(location :{})".format("c_test_bin"), - ], - type = "simple", - ) +custom_unittest( + name = "c_test", + command = [ + native.package_name() + "/buckifier/rocks_test_runner.sh", + "$(location :{})".format("c_test_bin"), + ], + type = "simple", +) if not is_opt_mode else None cpp_library( name = "env_basic_test_lib", @@ -833,6 +902,7 @@ cpp_library( os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [":rocksdb_test_lib"], external_deps = ROCKSDB_EXTERNAL_DEPS, link_whole = False, @@ -843,21 +913,21 @@ ROCKS_TESTS = [ [ "arena_test", "memory/arena_test.cc", - "serial", + "parallel", [], [], ], [ "auto_roll_logger_test", "logging/auto_roll_logger_test.cc", - "serial", + 
"parallel", [], [], ], [ "autovector_test", "util/autovector_test.cc", - "serial", + "parallel", [], [], ], @@ -871,140 +941,154 @@ ROCKS_TESTS = [ [ "blob_db_test", "utilities/blob_db/blob_db_test.cc", - "serial", + "parallel", [], [], ], [ "blob_file_addition_test", "db/blob/blob_file_addition_test.cc", - "serial", + "parallel", [], [], ], [ "blob_file_builder_test", "db/blob/blob_file_builder_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "blob_file_cache_test", + "db/blob/blob_file_cache_test.cc", + "parallel", [], [], ], [ "blob_file_garbage_test", "db/blob/blob_file_garbage_test.cc", - "serial", + "parallel", [], [], ], [ "blob_file_reader_test", "db/blob/blob_file_reader_test.cc", - "serial", + "parallel", [], [], ], [ "block_based_filter_block_test", "table/block_based/block_based_filter_block_test.cc", - "serial", + "parallel", [], [], ], [ "block_based_table_reader_test", "table/block_based/block_based_table_reader_test.cc", - "serial", + "parallel", [], [], ], [ "block_cache_trace_analyzer_test", "tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc", - "serial", + "parallel", [], [], ], [ "block_cache_tracer_test", "trace_replay/block_cache_tracer_test.cc", - "serial", + "parallel", [], [], ], [ "block_fetcher_test", "table/block_fetcher_test.cc", - "serial", + "parallel", [], [], ], [ "block_test", "table/block_based/block_test.cc", - "serial", + "parallel", [], [], ], [ "bloom_test", "util/bloom_test.cc", - "serial", + "parallel", [], [], ], [ "cache_simulator_test", "utilities/simulator_cache/cache_simulator_test.cc", - "serial", + "parallel", [], [], ], [ "cache_test", "cache/cache_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_format_test", "utilities/cassandra/cassandra_format_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_functional_test", "utilities/cassandra/cassandra_functional_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_row_merge_test", "utilities/cassandra/cassandra_row_merge_test.cc", - "serial", + "parallel", [], [], ], [ "cassandra_serialize_test", "utilities/cassandra/cassandra_serialize_test.cc", - "serial", + "parallel", [], [], ], [ "checkpoint_test", "utilities/checkpoint/checkpoint_test.cc", - "serial", + "parallel", [], [], ], [ "cleanable_test", "table/cleanable_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "clipping_iterator_test", + "db/compaction/clipping_iterator_test.cc", + "parallel", [], [], ], @@ -1025,7 +1109,7 @@ ROCKS_TESTS = [ [ "coding_test", "util/coding_test.cc", - "serial", + "parallel", [], [], ], @@ -1039,119 +1123,154 @@ ROCKS_TESTS = [ [ "compact_files_test", "db/compact_files_test.cc", - "serial", + "parallel", [], [], ], [ "compact_on_deletion_collector_test", "utilities/table_properties_collectors/compact_on_deletion_collector_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_iterator_test", "db/compaction/compaction_iterator_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_job_stats_test", "db/compaction/compaction_job_stats_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_job_test", "db/compaction/compaction_job_test.cc", - "serial", + "parallel", [], [], ], [ "compaction_picker_test", "db/compaction/compaction_picker_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "compaction_service_test", + "db/compaction/compaction_service_test.cc", + "parallel", [], [], ], [ "comparator_db_test", "db/comparator_db_test.cc", - "serial", + "parallel", [], [], ], [ "configurable_test", "options/configurable_test.cc", - 
"serial", + "parallel", [], [], ], [ "corruption_test", "db/corruption_test.cc", - "serial", + "parallel", [], [], ], [ "crc32c_test", "util/crc32c_test.cc", - "serial", + "parallel", [], [], ], [ "cuckoo_table_builder_test", "table/cuckoo/cuckoo_table_builder_test.cc", - "serial", + "parallel", [], [], ], [ "cuckoo_table_db_test", "db/cuckoo_table_db_test.cc", - "serial", + "parallel", [], [], ], [ "cuckoo_table_reader_test", "table/cuckoo/cuckoo_table_reader_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "customizable_test", + "options/customizable_test.cc", + "parallel", [], [], ], [ "data_block_hash_index_test", "table/block_based/data_block_hash_index_test.cc", - "serial", + "parallel", [], [], ], [ "db_basic_test", "db/db_basic_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "db_blob_basic_test", + "db/blob/db_blob_basic_test.cc", + "parallel", + [], + [], + ], + [ + "db_blob_compaction_test", + "db/blob/db_blob_compaction_test.cc", + "parallel", + [], + [], + ], + [ + "db_blob_corruption_test", + "db/blob/db_blob_corruption_test.cc", + "parallel", [], [], ], [ "db_blob_index_test", "db/blob/db_blob_index_test.cc", - "serial", + "parallel", [], [], ], [ "db_block_cache_test", "db/db_block_cache_test.cc", - "serial", + "parallel", [], [], ], @@ -1186,84 +1305,91 @@ ROCKS_TESTS = [ [ "db_dynamic_level_test", "db/db_dynamic_level_test.cc", - "serial", + "parallel", [], [], ], [ "db_encryption_test", "db/db_encryption_test.cc", - "serial", + "parallel", [], [], ], [ "db_flush_test", "db/db_flush_test.cc", - "serial", + "parallel", [], [], ], [ "db_inplace_update_test", "db/db_inplace_update_test.cc", - "serial", + "parallel", [], [], ], [ "db_io_failure_test", "db/db_io_failure_test.cc", - "serial", + "parallel", [], [], ], [ "db_iter_stress_test", "db/db_iter_stress_test.cc", - "serial", + "parallel", [], [], ], [ "db_iter_test", "db/db_iter_test.cc", - "serial", + "parallel", [], [], ], [ "db_iterator_test", "db/db_iterator_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "db_kv_checksum_test", + "db/db_kv_checksum_test.cc", + "parallel", [], [], ], [ "db_log_iter_test", "db/db_log_iter_test.cc", - "serial", + "parallel", [], [], ], [ "db_logical_block_size_cache_test", "db/db_logical_block_size_cache_test.cc", - "serial", + "parallel", [], [], ], [ "db_memtable_test", "db/db_memtable_test.cc", - "serial", + "parallel", [], [], ], [ "db_merge_operand_test", "db/db_merge_operand_test.cc", - "serial", + "parallel", [], [], ], @@ -1277,28 +1403,28 @@ ROCKS_TESTS = [ [ "db_options_test", "db/db_options_test.cc", - "serial", + "parallel", [], [], ], [ "db_properties_test", "db/db_properties_test.cc", - "serial", + "parallel", [], [], ], [ "db_range_del_test", "db/db_range_del_test.cc", - "serial", + "parallel", [], [], ], [ "db_secondary_test", - "db/db_impl/db_secondary_test.cc", - "serial", + "db/db_secondary_test.cc", + "parallel", [], [], ], @@ -1312,21 +1438,21 @@ ROCKS_TESTS = [ [ "db_statistics_test", "db/db_statistics_test.cc", - "serial", + "parallel", [], [], ], [ "db_table_properties_test", "db/db_table_properties_test.cc", - "serial", + "parallel", [], [], ], [ "db_tailing_iter_test", "db/db_tailing_iter_test.cc", - "serial", + "parallel", [], [], ], @@ -1361,42 +1487,49 @@ ROCKS_TESTS = [ [ "db_with_timestamp_basic_test", "db/db_with_timestamp_basic_test.cc", - "serial", + "parallel", [], [], ], [ "db_with_timestamp_compaction_test", "db/db_with_timestamp_compaction_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + 
"db_write_buffer_manager_test", + "db/db_write_buffer_manager_test.cc", + "parallel", [], [], ], [ "db_write_test", "db/db_write_test.cc", - "serial", + "parallel", [], [], ], [ "dbformat_test", "db/dbformat_test.cc", - "serial", + "parallel", [], [], ], [ "defer_test", "util/defer_test.cc", - "serial", + "parallel", [], [], ], [ "delete_scheduler_test", "file/delete_scheduler_test.cc", - "serial", + "parallel", [], [], ], @@ -1410,21 +1543,21 @@ ROCKS_TESTS = [ [ "dynamic_bloom_test", "util/dynamic_bloom_test.cc", - "serial", + "parallel", [], [], ], [ "env_basic_test", "env/env_basic_test.cc", - "serial", + "parallel", [], [], ], [ "env_logger_test", "logging/env_logger_test.cc", - "serial", + "parallel", [], [], ], @@ -1438,28 +1571,28 @@ ROCKS_TESTS = [ [ "env_timed_test", "utilities/env_timed_test.cc", - "serial", + "parallel", [], [], ], [ "error_handler_fs_test", "db/error_handler_fs_test.cc", - "serial", + "parallel", [], [], ], [ "event_logger_test", "logging/event_logger_test.cc", - "serial", + "parallel", [], [], ], [ "external_sst_file_basic_test", "db/external_sst_file_basic_test.cc", - "serial", + "parallel", [], [], ], @@ -1480,7 +1613,7 @@ ROCKS_TESTS = [ [ "file_indexer_test", "db/file_indexer_test.cc", - "serial", + "parallel", [], [], ], @@ -1494,56 +1627,56 @@ ROCKS_TESTS = [ [ "filelock_test", "util/filelock_test.cc", - "serial", + "parallel", [], [], ], [ "filename_test", "db/filename_test.cc", - "serial", + "parallel", [], [], ], [ "flush_job_test", "db/flush_job_test.cc", - "serial", + "parallel", [], [], ], [ "full_filter_block_test", "table/block_based/full_filter_block_test.cc", - "serial", + "parallel", [], [], ], [ "hash_table_test", "utilities/persistent_cache/hash_table_test.cc", - "serial", + "parallel", [], [], ], [ "hash_test", "util/hash_test.cc", - "serial", + "parallel", [], [], ], [ "heap_test", "util/heap_test.cc", - "serial", + "parallel", [], [], ], [ "histogram_test", "monitoring/histogram_test.cc", - "serial", + "parallel", [], [], ], @@ -1564,56 +1697,56 @@ ROCKS_TESTS = [ [ "io_posix_test", "env/io_posix_test.cc", - "serial", + "parallel", [], [], ], [ "io_tracer_parser_test", "tools/io_tracer_parser_test.cc", - "serial", + "parallel", [], [], ], [ "io_tracer_test", "trace_replay/io_tracer_test.cc", - "serial", + "parallel", [], [], ], [ "iostats_context_test", "monitoring/iostats_context_test.cc", - "serial", + "parallel", [], [], ], [ "ldb_cmd_test", "tools/ldb_cmd_test.cc", - "serial", + "parallel", [], [], ], [ "listener_test", "db/listener_test.cc", - "serial", + "parallel", [], [], ], [ "log_test", "db/log_test.cc", - "serial", + "parallel", [], [], ], [ "lru_cache_test", "cache/lru_cache_test.cc", - "serial", + "parallel", [], [], ], @@ -1627,126 +1760,126 @@ ROCKS_TESTS = [ [ "memkind_kmem_allocator_test", "memory/memkind_kmem_allocator_test.cc", - "serial", + "parallel", [], [], ], [ "memory_test", "utilities/memory/memory_test.cc", - "serial", + "parallel", [], [], ], [ "memtable_list_test", "db/memtable_list_test.cc", - "serial", + "parallel", [], [], ], [ "merge_helper_test", "db/merge_helper_test.cc", - "serial", + "parallel", [], [], ], [ "merge_test", "db/merge_test.cc", - "serial", + "parallel", [], [], ], [ "merger_test", "table/merger_test.cc", - "serial", + "parallel", [], [], ], [ "mock_env_test", "env/mock_env_test.cc", - "serial", + "parallel", [], [], ], [ "object_registry_test", "utilities/object_registry_test.cc", - "serial", + "parallel", [], [], ], [ "obsolete_files_test", "db/obsolete_files_test.cc", - "serial", + 
"parallel", [], [], ], [ "optimistic_transaction_test", "utilities/transactions/optimistic_transaction_test.cc", - "serial", + "parallel", [], [], ], [ "option_change_migration_test", "utilities/option_change_migration/option_change_migration_test.cc", - "serial", + "parallel", [], [], ], [ "options_file_test", "db/options_file_test.cc", - "serial", + "parallel", [], [], ], [ "options_settable_test", "options/options_settable_test.cc", - "serial", + "parallel", [], [], ], [ "options_test", "options/options_test.cc", - "serial", + "parallel", [], [], ], [ "options_util_test", "utilities/options/options_util_test.cc", - "serial", + "parallel", [], [], ], [ "partitioned_filter_block_test", "table/block_based/partitioned_filter_block_test.cc", - "serial", + "parallel", [], [], ], [ "perf_context_test", "db/perf_context_test.cc", - "serial", + "parallel", [], [], ], [ "periodic_work_scheduler_test", "db/periodic_work_scheduler_test.cc", - "serial", + "parallel", [], [], ], @@ -1760,63 +1893,77 @@ ROCKS_TESTS = [ [ "plain_table_db_test", "db/plain_table_db_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "point_lock_manager_test", + "utilities/transactions/lock/point/point_lock_manager_test.cc", + "parallel", [], [], ], [ "prefetch_test", "file/prefetch_test.cc", - "serial", + "parallel", [], [], ], [ "prefix_test", "db/prefix_test.cc", - "serial", + "parallel", [], [], ], [ "random_access_file_reader_test", "file/random_access_file_reader_test.cc", - "serial", + "parallel", [], [], ], [ "random_test", "util/random_test.cc", - "serial", + "parallel", [], [], ], [ "range_del_aggregator_test", "db/range_del_aggregator_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "range_locking_test", + "utilities/transactions/lock/range/range_locking_test.cc", + "parallel", [], [], ], [ "range_tombstone_fragmenter_test", "db/range_tombstone_fragmenter_test.cc", - "serial", + "parallel", [], [], ], [ "rate_limiter_test", "util/rate_limiter_test.cc", - "serial", + "parallel", [], [], ], [ "reduce_levels_test", "tools/reduce_levels_test.cc", - "serial", + "parallel", [], [], ], @@ -1830,84 +1977,91 @@ ROCKS_TESTS = [ [ "repair_test", "db/repair_test.cc", - "serial", + "parallel", [], [], ], [ "repeatable_thread_test", "util/repeatable_thread_test.cc", - "serial", + "parallel", + [], + [], + ], + [ + "ribbon_test", + "util/ribbon_test.cc", + "parallel", [], [], ], [ "sim_cache_test", "utilities/simulator_cache/sim_cache_test.cc", - "serial", + "parallel", [], [], ], [ "skiplist_test", "memtable/skiplist_test.cc", - "serial", + "parallel", [], [], ], [ "slice_test", "util/slice_test.cc", - "serial", + "parallel", [], [], ], [ "slice_transform_test", "util/slice_transform_test.cc", - "serial", + "parallel", [], [], ], [ "sst_dump_test", "tools/sst_dump_test.cc", - "serial", + "parallel", [], [], ], [ "sst_file_reader_test", "table/sst_file_reader_test.cc", - "serial", + "parallel", [], [], ], [ "statistics_test", "monitoring/statistics_test.cc", - "serial", + "parallel", [], [], ], [ "stats_history_test", "monitoring/stats_history_test.cc", - "serial", + "parallel", [], [], ], [ "stringappend_test", "utilities/merge_operators/string_append/stringappend_test.cc", - "serial", + "parallel", [], [], ], [ "table_properties_collector_test", "db/table_properties_collector_test.cc", - "serial", + "parallel", [], [], ], @@ -1921,48 +2075,41 @@ ROCKS_TESTS = [ [ "testutil_test", "test_util/testutil_test.cc", - "serial", + "parallel", [], [], ], [ "thread_list_test", "util/thread_list_test.cc", - "serial", 
+ "parallel", [], [], ], [ "thread_local_test", "util/thread_local_test.cc", - "serial", + "parallel", [], [], ], [ "timer_queue_test", "util/timer_queue_test.cc", - "serial", + "parallel", [], [], ], [ "timer_test", "util/timer_test.cc", - "serial", + "parallel", [], [], ], [ "trace_analyzer_test", "tools/trace_analyzer_test.cc", - "serial", - [], - [], - ], - [ - "transaction_lock_mgr_test", - "utilities/transactions/transaction_lock_mgr_test.cc", "parallel", [], [], @@ -1977,84 +2124,84 @@ ROCKS_TESTS = [ [ "ttl_test", "utilities/ttl/ttl_test.cc", - "serial", + "parallel", [], [], ], [ "util_merge_operators_test", "utilities/util_merge_operators_test.cc", - "serial", + "parallel", [], [], ], [ "version_builder_test", "db/version_builder_test.cc", - "serial", + "parallel", [], [], ], [ "version_edit_test", "db/version_edit_test.cc", - "serial", + "parallel", [], [], ], [ "version_set_test", "db/version_set_test.cc", - "serial", + "parallel", [], [], ], [ "wal_manager_test", "db/wal_manager_test.cc", - "serial", + "parallel", [], [], ], [ "work_queue_test", "util/work_queue_test.cc", - "serial", + "parallel", [], [], ], [ "write_batch_test", "db/write_batch_test.cc", - "serial", + "parallel", [], [], ], [ "write_batch_with_index_test", "utilities/write_batch_with_index/write_batch_with_index_test.cc", - "serial", + "parallel", [], [], ], [ "write_buffer_manager_test", "memtable/write_buffer_manager_test.cc", - "serial", + "parallel", [], [], ], [ "write_callback_test", "db/write_callback_test.cc", - "serial", + "parallel", [], [], ], [ "write_controller_test", "db/write_controller_test.cc", - "serial", + "parallel", [], [], ], @@ -2085,6 +2232,7 @@ ROCKS_TESTS = [ os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS + extra_compiler_flags, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [":rocksdb_test_lib"] + extra_deps, external_deps = ROCKSDB_EXTERNAL_DEPS + [ ("googletest", None, "gtest"), diff --git a/USERS.md b/USERS.md index 11d52a0519c..fb612efabd0 100644 --- a/USERS.md +++ b/USERS.md @@ -26,6 +26,9 @@ Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasu ## Yahoo Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights +## Baidu +[Apache Doris](http://doris.apache.org/master/en/) is a MPP analytical database engine released by Baidu. It [uses RocksDB](http://doris.apache.org/master/en/administrator-guide/operation/tablet-meta-tool.html) to manage its tablet's metadata. + ## CockroachDB CockroachDB is an open-source geo-replicated transactional database. They are using RocksDB as their storage engine. Check out their github: https://github.com/cockroachdb/cockroach @@ -44,7 +47,7 @@ Tango is using RocksDB as a graph storage to store all users' connection data an Turn is using RocksDB as a storage layer for their key/value store, serving at peak 2.4MM QPS out of different datacenters. 
Check out our RocksDB Protobuf merge operator at: https://github.com/vladb38/rocksdb_protobuf -## Santanader UK/Cloudera Profession Services +## Santander UK/Cloudera Profession Services Check out their blog post: http://blog.cloudera.com/blog/2015/08/inside-santanders-near-real-time-data-ingest-architecture/ ## Airbnb @@ -67,7 +70,7 @@ Pinterest's Object Retrieval System uses RocksDB for storage: https://www.youtub [VWO's](https://vwo.com/) Smart Code checker and URL helper uses RocksDB to store all the URLs where VWO's Smart Code is installed. ## quasardb -[quasardb](https://www.quasardb.net) is a high-performance, distributed, transactional key-value database that integrates well with in-memory analytics engines such as Apache Spark. +[quasardb](https://www.quasardb.net) is a high-performance, distributed, transactional key-value database that integrates well with in-memory analytics engines such as Apache Spark. quasardb uses a heavily tuned RocksDB as its persistence layer. ## Netflix @@ -86,7 +89,7 @@ quasardb uses a heavily tuned RocksDB as its persistence layer. [Uber](http://eng.uber.com/cherami/) uses RocksDB as a durable and scalable task queue. ## 360 Pika -[360](http://www.360.cn/) [Pika](https://github.com/Qihoo360/pika) is a nosql compatible with redis. With the huge amount of data stored, redis may suffer for a capacity bottleneck, and pika was born for solving it. It has widely been widely used in many company +[360](http://www.360.cn/) [Pika](https://github.com/Qihoo360/pika) is a nosql compatible with redis. With the huge amount of data stored, redis may suffer for a capacity bottleneck, and pika was born for solving it. It has widely been used in many companies. ## LzLabs LzLabs is using RocksDB as a storage engine in their multi-database distributed framework to store application configuration and user data. @@ -96,16 +99,19 @@ LzLabs is using RocksDB as a storage engine in their multi-database distributed ## IOTA Foundation [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. - + ## Avrio Project [Avrio Project](http://avrio-project.github.io/avrio.network/) is using RocksDB in [Avrio ](https://github.com/avrio-project/avrio) to store blocks, account balances and data and other blockchain-releated data. Avrio is a multiblockchain decentralized cryptocurrency empowering monetary transactions. - + ## Crux [Crux](https://github.com/juxt/crux) is a document database that uses RocksDB for local [EAV](https://en.wikipedia.org/wiki/Entity%E2%80%93attribute%E2%80%93value_model) index storage to enable point-in-time bitemporal Datalog queries. The "unbundled" architecture uses Kafka to provide horizontal scalability. ## Nebula Graph - [Nebula Graph](https://github.com/vesoft-inc/nebula) is a distributed, scalable, lightning-fast, open source graph database capable of hosting super large scale graphs with dozens of billions of vertices (nodes) and trillions of edges, with milliseconds of latency. ## YugabyteDB [YugabyteDB](https://www.yugabyte.com/) is an open source, high performance, distributed SQL database that uses RocksDB as its storage layer. For more information, please see https://github.com/yugabyte/yugabyte-db/. 
+ +## ArangoDB +[ArangoDB](https://www.arangodb.com/) is a native multi-model database with flexible data models for documents, graphs, and key-values, for building high performance applications using a convenient SQL-like query language or JavaScript extensions. It uses RocksDB as its storage engine. + diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index f0909bc6171..993dd4d1b18 100644 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -24,10 +24,10 @@ # (This generates a TARGET file without user-specified dependency for unit # tests.) # $python3 buckifier/buckify_rocksdb.py \ -# '{"fake": { \ -# "extra_deps": [":test_dep", "//fakes/module:mock1"], \ -# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"], \ -# } \ +# '{"fake": { +# "extra_deps": [":test_dep", "//fakes/module:mock1"], +# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"] +# } # }' # (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB # unit tests, and will use the extra_compiler_flags to compile the unit test @@ -69,25 +69,25 @@ def get_cc_files(repo_path): return cc_files -# Get parallel tests from Makefile -def get_parallel_tests(repo_path): +# Get non_parallel tests from Makefile +def get_non_parallel_tests(repo_path): Makefile = repo_path + "/Makefile" s = set({}) - found_parallel_tests = False + found_non_parallel_tests = False for line in open(Makefile): line = line.strip() - if line.startswith("PARALLEL_TEST ="): - found_parallel_tests = True - elif found_parallel_tests: + if line.startswith("NON_PARALLEL_TEST ="): + found_non_parallel_tests = True + elif found_non_parallel_tests: if line.endswith("\\"): # remove the trailing \ line = line[:-1] line = line.strip() s.add(line) else: - # we consumed all the parallel tests + # we consumed all the non_parallel tests break return s @@ -123,23 +123,33 @@ def generate_targets(repo_path, deps_map): src_mk = parse_src_mk(repo_path) # get all .cc files cc_files = get_cc_files(repo_path) - # get parallel tests from Makefile - parallel_tests = get_parallel_tests(repo_path) + # get non_parallel tests from Makefile + non_parallel_tests = get_non_parallel_tests(repo_path) - if src_mk is None or cc_files is None or parallel_tests is None: + if src_mk is None or cc_files is None or non_parallel_tests is None: return False - TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path) + extra_argv = "" + if len(sys.argv) >= 2: + # Heuristically quote and canonicalize whitespace for inclusion + # in how the file was generated.
+ extra_argv = " '{0}'".format(" ".join(sys.argv[1].split())) + + TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path, extra_argv) # rocksdb_lib TARGETS.add_library( "rocksdb_lib", src_mk["LIB_SOURCES"] + + # always add range_tree, it's only excluded on ppc64, which we don't use internally + src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"]) # rocksdb_whole_archive_lib TARGETS.add_library( "rocksdb_whole_archive_lib", src_mk["LIB_SOURCES"] + + # always add range_tree, it's only excluded on ppc64, which we don't use internally + src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"], deps=None, headers=None, @@ -163,6 +173,11 @@ def generate_targets(repo_path, deps_map): src_mk.get("ANALYZER_LIB_SOURCES", []) + ["test_util/testutil.cc"], [":rocksdb_lib"]) + # rocksdb_cache_bench_tools_lib + TARGETS.add_library( + "rocksdb_cache_bench_tools_lib", + src_mk.get("CACHE_BENCH_LIB_SOURCES", []), + [":rocksdb_lib"]) # rocksdb_stress_lib TARGETS.add_rocksdb_library( "rocksdb_stress_lib", @@ -201,7 +216,7 @@ def generate_targets(repo_path, deps_map): TARGETS.register_test( test_target_name, test_src, - test in parallel_tests, + test not in non_parallel_tests, json.dumps(deps['extra_deps']), json.dumps(deps['extra_compiler_flags'])) diff --git a/buckifier/targets_builder.py b/buckifier/targets_builder.py index e9f6f5be40a..d2649c1311d 100644 --- a/buckifier/targets_builder.py +++ b/buckifier/targets_builder.py @@ -25,10 +25,11 @@ def pretty_list(lst, indent=8): class TARGETSBuilder(object): - def __init__(self, path): + def __init__(self, path, extra_argv): self.path = path self.targets_file = open(path, 'wb') - header = targets_cfg.rocksdb_target_header_template + header = targets_cfg.rocksdb_target_header_template.format( + extra_argv=extra_argv) self.targets_file.write(header.encode("utf-8")) self.total_lib = 0 self.total_bin = 0 @@ -79,26 +80,25 @@ def add_binary(self, name, srcs, deps=None): def add_c_test(self): self.targets_file.write(b""" -if not is_opt_mode: - cpp_binary( - name = "c_test_bin", - srcs = ["db/c_test.c"], - arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, - os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, - compiler_flags = ROCKSDB_COMPILER_FLAGS, - preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, - deps = [":rocksdb_test_lib"], - ) +cpp_binary( + name = "c_test_bin", + srcs = ["db/c_test.c"], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, + deps = [":rocksdb_test_lib"], +) if not is_opt_mode else None -if not is_opt_mode: - custom_unittest( - "c_test", - command = [ - native.package_name() + "/buckifier/rocks_test_runner.sh", - "$(location :{})".format("c_test_bin"), - ], - type = "simple", - ) +custom_unittest( + name = "c_test", + command = [ + native.package_name() + "/buckifier/rocks_test_runner.sh", + "$(location :{})".format("c_test_bin"), + ], + type = "simple", +) if not is_opt_mode else None """) def register_test(self, diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 0c20ef095c8..38037b250d6 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -5,7 +5,8 @@ from __future__ import unicode_literals rocksdb_target_header_template = \ - """# This file \100generated by `python3 buckifier/buckify_rocksdb.py` + """# This file \100generated by: +#$ python3 buckifier/buckify_rocksdb.py{extra_argv} # --> DO 
NOT EDIT MANUALLY <-- # This file is a Facebook-specific integration for buck builds, so can # only be validated by Facebook employees. @@ -16,7 +17,7 @@ REPO_PATH = package_name() + "/" -ROCKSDB_COMPILER_FLAGS = [ +ROCKSDB_COMPILER_FLAGS_0 = [ "-fno-builtin-memcmp", # Needed to compile in fbcode "-Wno-expansion-to-defined", @@ -31,10 +32,10 @@ ("zlib", None, "z"), ("gflags", None, "gflags"), ("lz4", None, "lz4"), - ("zstd", None), + ("zstd", None, "zstd"), ] -ROCKSDB_OS_DEPS = [ +ROCKSDB_OS_DEPS_0 = [ ( "linux", ["third-party//numa:numa", "third-party//liburing:uring", "third-party//tbb:tbb"], @@ -45,7 +46,7 @@ ), ] -ROCKSDB_OS_PREPROCESSOR_FLAGS = [ +ROCKSDB_OS_PREPROCESSOR_FLAGS_0 = [ ( "linux", [ @@ -93,17 +94,19 @@ # Added missing flags from output of build_detect_platform "-DROCKSDB_BACKTRACE", +] - # Directories with files for #include - "-I" + REPO_PATH + "include/", - "-I" + REPO_PATH, +# Directories with files for #include +ROCKSDB_INCLUDE_PATHS = [ + "", + "include", ] -ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { +ROCKSDB_ARCH_PREPROCESSOR_FLAGS = {{ "x86_64": [ "-DHAVE_PCLMUL", ], -} +}} build_mode = read_config("fbcode", "build_mode") @@ -111,18 +114,18 @@ # -DNDEBUG is added by default in opt mode in fbcode. But adding it twice # doesn't harm and avoid forgetting to add it. -ROCKSDB_COMPILER_FLAGS += (["-DNDEBUG"] if is_opt_mode else []) +ROCKSDB_COMPILER_FLAGS = ROCKSDB_COMPILER_FLAGS_0 + (["-DNDEBUG"] if is_opt_mode else []) sanitizer = read_config("fbcode", "sanitizer") # Do not enable jemalloc if sanitizer presents. RocksDB will further detect # whether the binary is linked with jemalloc at runtime. -ROCKSDB_OS_PREPROCESSOR_FLAGS += ([( +ROCKSDB_OS_PREPROCESSOR_FLAGS = ROCKSDB_OS_PREPROCESSOR_FLAGS_0 + ([( "linux", ["-DROCKSDB_JEMALLOC"], )] if sanitizer == "" else []) -ROCKSDB_OS_DEPS += ([( +ROCKSDB_OS_DEPS = ROCKSDB_OS_DEPS_0 + ([( "linux", ["third-party//jemalloc:headers"], )] if sanitizer == "" else []) @@ -144,6 +147,7 @@ os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [{deps}], external_deps = ROCKSDB_EXTERNAL_DEPS{extra_external_deps}, link_whole = {link_whole}, @@ -160,6 +164,7 @@ os_deps = ROCKSDB_OS_DEPS, os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = ROCKSDB_LIB_DEPS, external_deps = ROCKSDB_EXTERNAL_DEPS, ) @@ -172,6 +177,7 @@ arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [{deps}], external_deps = ROCKSDB_EXTERNAL_DEPS, ) @@ -202,6 +208,7 @@ os_preprocessor_flags = ROCKSDB_OS_PREPROCESSOR_FLAGS, compiler_flags = ROCKSDB_COMPILER_FLAGS + extra_compiler_flags, preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + include_paths = ROCKSDB_INCLUDE_PATHS, deps = [":rocksdb_test_lib"] + extra_deps, external_deps = ROCKSDB_EXTERNAL_DEPS + [ ("googletest", None, "gtest"), diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index bc8b0dbc769..b8e469e2a9c 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -45,8 +45,13 @@ if test -z "$OUTPUT"; then exit 1 fi -# we depend on C++11 -PLATFORM_CXXFLAGS="-std=c++11" +# we depend on C++11, but should be compatible with newer standards +if [ "$ROCKSDB_CXX_STANDARD" ]; then + 
PLATFORM_CXXFLAGS="-std=$ROCKSDB_CXX_STANDARD" +else + PLATFORM_CXXFLAGS="-std=c++11" +fi + # we currently depend on POSIX platform COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX" @@ -64,10 +69,6 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then source "$PWD/build_tools/fbcode_config_platform007.sh" elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM009" ]; then source "$PWD/build_tools/fbcode_config_platform009.sh" - elif [ -z "$USE_CLANG" ]; then - # Still use platform007 for gcc by default for build break on - # some hosts. - source "$PWD/build_tools/fbcode_config_platform007.sh" else source "$PWD/build_tools/fbcode_config_platform009.sh" fi @@ -170,9 +171,12 @@ case "$TARGET_OS" in PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt -ldl" - if test $ROCKSDB_USE_IO_URING; then + if test -z "$ROCKSDB_USE_IO_URING"; then + ROCKSDB_USE_IO_URING=1 + fi + if test "$ROCKSDB_USE_IO_URING" -ne 0; then # check for liburing - $CXX $CFLAGS -x c++ - -luring -o /dev/null 2>/dev/null </dev/null < int main() { struct io_uring ring; @@ -283,7 +287,7 @@ if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then else if ! test $ROCKSDB_DISABLE_FALLOCATE; then # Test whether fallocate is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() { @@ -299,7 +303,7 @@ EOF if ! test $ROCKSDB_DISABLE_SNAPPY; then # Test whether Snappy library is installed # http://code.google.com/p/snappy/ - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -314,7 +318,7 @@ EOF # Test whether gflags library is installed # http://gflags.github.io/gflags/ # check if the namespace is gflags - if $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF + if $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF #include using namespace GFLAGS_NAMESPACE; int main() {} @@ -323,7 +327,7 @@ EOF COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" # check if namespace is gflags - elif $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF + elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF #include using namespace gflags; int main() {} @@ -332,7 +336,7 @@ EOF COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=gflags" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" # check if namespace is google - elif $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF + elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF #include using namespace google; int main() {} @@ -345,7 +349,7 @@ EOF if ! test $ROCKSDB_DISABLE_ZLIB; then # Test whether zlib library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -358,7 +362,7 @@ EOF if ! test $ROCKSDB_DISABLE_BZIP; then # Test whether bzip library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -371,7 +375,7 @@ EOF if ! test $ROCKSDB_DISABLE_LZ4; then # Test whether lz4 library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < #include int main() {} @@ -385,7 +389,7 @@ EOF if ! test $ROCKSDB_DISABLE_ZSTD; then # Test whether zstd library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() {} EOF @@ -398,7 +402,7 @@ EOF if ! 
test $ROCKSDB_DISABLE_NUMA; then # Test whether numa is available - $CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null </dev/null < #include int main() {} @@ -412,7 +416,7 @@ EOF if ! test $ROCKSDB_DISABLE_TBB; then # Test whether tbb is available - $CXX $CFLAGS $LDFLAGS -x c++ - -o /dev/null -ltbb 2>/dev/null </dev/null < int main() {} EOF @@ -425,7 +429,7 @@ EOF if ! test $ROCKSDB_DISABLE_JEMALLOC; then # Test whether jemalloc is available - if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null -ljemalloc \ + if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null -ljemalloc \ 2>/dev/null; then # This will enable some preprocessor identifiers in the Makefile JEMALLOC=1 @@ -446,7 +450,7 @@ EOF fi if ! test $JEMALLOC && ! test $ROCKSDB_DISABLE_TCMALLOC; then # jemalloc is not available. Let's try tcmalloc - if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null \ + if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null \ -ltcmalloc 2>/dev/null; then PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc" JAVA_LDFLAGS="$JAVA_LDFLAGS -ltcmalloc" @@ -455,7 +459,7 @@ EOF if ! test $ROCKSDB_DISABLE_MALLOC_USABLE_SIZE; then # Test whether malloc_usable_size is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { size_t res = malloc_usable_size(0); @@ -470,7 +474,7 @@ EOF if ! test $ROCKSDB_DISABLE_MEMKIND; then # Test whether memkind library is installed - $CXX $CFLAGS $COMMON_FLAGS -lmemkind -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { memkind_malloc(MEMKIND_DAX_KMEM, 1024); @@ -486,7 +490,7 @@ EOF if ! test $ROCKSDB_DISABLE_PTHREAD_MUTEX_ADAPTIVE_NP; then # Test whether PTHREAD_MUTEX_ADAPTIVE_NP mutex type is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { int x = PTHREAD_MUTEX_ADAPTIVE_NP; @@ -501,7 +505,7 @@ EOF if ! test $ROCKSDB_DISABLE_BACKTRACE; then # Test whether backtrace is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { void* frames[1]; @@ -513,7 +517,7 @@ EOF COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE" else # Test whether execinfo library is installed - $CXX $CFLAGS -lexecinfo -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { void* frames[1]; @@ -530,7 +534,7 @@ EOF if ! test $ROCKSDB_DISABLE_PG; then # Test if -pg is supported - $CXX $CFLAGS -pg -x c++ - -o /dev/null 2>/dev/null </dev/null </dev/null </dev/null < int main() { int fd = open("/dev/null", 0); @@ -556,7 +560,7 @@ EOF if ! test $ROCKSDB_DISABLE_SCHED_GETCPU; then # Test whether sched_getcpu is supported - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { int cpuid = sched_getcpu(); @@ -570,7 +574,7 @@ EOF if ! test $ROCKSDB_DISABLE_AUXV_GETAUXVAL; then # Test whether getauxval is supported - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null </dev/null < int main() { uint64_t auxv = getauxval(AT_HWCAP); @@ -598,7 +602,7 @@ fi # -Wshorten-64-to-32 breaks compilation on FreeBSD i386 if ! [ "$TARGET_OS" = FreeBSD -a "$TARGET_ARCHITECTURE" = i386 ]; then # Test whether -Wshorten-64-to-32 is available - $CXX $CFLAGS -x c++ - -o /dev/null -Wshorten-64-to-32 2>/dev/null </dev/null </dev/null < + #include + #include + + int main(int argc, char *argv[]) { + printf("GNU libc version: %s\n", gnu_get_libc_version()); + return 0; + } +EOF + if [ "$?" 
!= 0 ]; then + PPC_LIBC_IS_GNU=0 fi fi @@ -835,8 +861,12 @@ echo "CXX=$CXX" >> "$OUTPUT" echo "AR=$AR" >> "$OUTPUT" echo "PLATFORM=$PLATFORM" >> "$OUTPUT" echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT" +echo "PLATFORM_CMAKE_FLAGS=$PLATFORM_CMAKE_FLAGS" >> "$OUTPUT" echo "JAVA_LDFLAGS=$JAVA_LDFLAGS" >> "$OUTPUT" echo "JAVA_STATIC_LDFLAGS=$JAVA_STATIC_LDFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_DEPS_CCFLAGS=$JAVA_STATIC_DEPS_CCFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_DEPS_CXXFLAGS=$JAVA_STATIC_DEPS_CXXFLAGS" >> "$OUTPUT" +echo "JAVA_STATIC_DEPS_LDFLAGS=$JAVA_STATIC_DEPS_LDFLAGS" >> "$OUTPUT" echo "JAVAC_ARGS=$JAVAC_ARGS" >> "$OUTPUT" echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT" echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT" @@ -870,3 +900,6 @@ echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT" if test -n "$USE_FOLLY_DISTRIBUTED_MUTEX"; then echo "USE_FOLLY_DISTRIBUTED_MUTEX=$USE_FOLLY_DISTRIBUTED_MUTEX" >> "$OUTPUT" fi +if test -n "$PPC_LIBC_IS_GNU"; then + echo "PPC_LIBC_IS_GNU=$PPC_LIBC_IS_GNU" >> "$OUTPUT" +fi diff --git a/build_tools/fbcode_config_platform009.sh b/build_tools/fbcode_config_platform009.sh index cf24bd81599..82c85b09c1d 100644 --- a/build_tools/fbcode_config_platform009.sh +++ b/build_tools/fbcode_config_platform009.sh @@ -120,7 +120,7 @@ if [ -z "$USE_CLANG" ]; then CXX="$GCC_BASE/bin/g++" AR="$GCC_BASE/bin/gcc-ar" - CFLAGS+=" -B$BINUTILS/gold" + CFLAGS+=" -B$BINUTILS" CFLAGS+=" -isystem $LIBGCC_INCLUDE" CFLAGS+=" -isystem $GLIBC_INCLUDE" JEMALLOC=1 @@ -133,7 +133,7 @@ else KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include" - CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib" + CFLAGS+=" -B$BINUTILS -nostdinc -nostdlib" CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x " CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x/x86_64-facebook-linux " CFLAGS+=" -isystem $GLIBC_INCLUDE" @@ -150,10 +150,11 @@ CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PR CXXFLAGS+=" $CFLAGS" EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS" -EXEC_LDFLAGS+=" -B$BINUTILS/gold" +EXEC_LDFLAGS+=" -B$BINUTILS" EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform009/lib/ld.so" EXEC_LDFLAGS+=" $LIBUNWIND" EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform009/lib" +EXEC_LDFLAGS+=" -Wl,-rpath=$GCC_BASE/lib64" # required by libtbb EXEC_LDFLAGS+=" -ldl" diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh index 386885b578e..c2842dfa493 100755 --- a/build_tools/format-diff.sh +++ b/build_tools/format-diff.sh @@ -52,15 +52,16 @@ else else echo "You didn't have clang-format-diff.py and/or clang-format available in your computer!" echo "You can download clang-format-diff.py by running: " - echo " curl --location http://goo.gl/iUW1u2 -o ${CLANG_FORMAT_DIFF}" + echo " curl --location https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py -o ${REPO_ROOT}/clang-format-diff.py" + echo "You should make sure the downloaded script is not compromised." echo "You can download clang-format by running:" echo " brew install clang-format" echo " Or" echo " apt install clang-format" echo " This might work too:" echo " yum install git-clang-format" - echo "Then, move both files (i.e. ${CLANG_FORMAT_DIFF} and clang-format) to some directory within PATH=${PATH}" - echo "and make sure ${CLANG_FORMAT_DIFF} is executable." 
+ echo "Then make sure clang-format is available and executable from \$PATH:" + echo " clang-format --version" exit 128 fi # Check argparse pre-req on interpreter, or it will fail @@ -75,17 +76,16 @@ else exit 129 fi # Unfortunately, some machines have a Python2 clang-format-diff.py - # installed but only a Python3 interpreter installed. Rather than trying - # different Python versions that might be installed, we can try migrating - # the code to Python3 if it looks like Python2 + # installed but only a Python3 interpreter installed. Unfortunately, + # automatic 2to3 migration is insufficient, so suggest downloading latest. if grep -q "print '" "$CFD_PATH" && \ ${PYTHON:-python3} --version | grep -q 'ython 3'; then - if [ ! -f "$REPO_ROOT/.py3/clang-format-diff.py" ]; then - echo "Migrating $CFD_PATH to Python3 in a hidden file" - mkdir -p "$REPO_ROOT/.py3" - ${PYTHON:-python3} -m lib2to3 -w -n -o "$REPO_ROOT/.py3" "$CFD_PATH" > /dev/null || exit 128 - fi - CFD_PATH="$REPO_ROOT/.py3/clang-format-diff.py" + echo "You have clang-format-diff.py for Python 2 but are using a Python 3" + echo "interpreter (${PYTHON:-python3})." + echo "You can download clang-format-diff.py for Python 3 by running: " + echo " curl --location https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py -o ${REPO_ROOT}/clang-format-diff.py" + echo "You should make sure the downloaded script is not compromised." + exit 130 fi CLANG_FORMAT_DIFF="${PYTHON:-python3} $CFD_PATH" # This had better work after all those checks @@ -136,9 +136,11 @@ then FORMAT_UPSTREAM_MERGE_BASE="$(git merge-base "$FORMAT_UPSTREAM" HEAD)" # Get the differences diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -p 1) + echo "Checking format of changes not yet in $FORMAT_UPSTREAM..." else # Check the format of uncommitted lines, diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) + echo "Checking format of uncommitted changes..." fi if [ -z "$diffs" ] diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator index f5ab73a7f6a..42ce511f0a6 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -3,7 +3,7 @@ # to determine next steps to run # Usage: -# EMAIL= ONCALL= TRIGGER= SUBSCRIBER= rocks_ci.py +# EMAIL= ONCALL= TRIGGER= SUBSCRIBER= WORKINGDIR= rocksdb-lego-determinator # # Input Value # ------------------------------------------------------------------------- @@ -11,7 +11,7 @@ # ONCALL Email address to raise a task on failure # TRIGGER Trigger conditions for email. Valid values are fail, warn, all # SUBSCRIBER Email addresss to add as subscriber for task -# +# WORKINGDIR Working directory # # Report configuration @@ -53,13 +53,19 @@ if [[ ! -z $REPORT_EMAIL || ! -z $CREATE_TASK ]]; then ]" fi +# Working directory for the following command, default to current directory +WORKING_DIR=. +if [ ! -z $WORKINGDIR ]; then + WORKING_DIR=$WORKINGDIR +fi + # # Helper variables # CLEANUP_ENV=" { 'name':'Cleanup environment', - 'shell':'rm -rf /dev/shm/rocksdb && mkdir /dev/shm/rocksdb && (chmod +t /dev/shm || true) && make clean', + 'shell':'cd $WORKING_DIR; rm -rf /dev/shm/rocksdb && mkdir /dev/shm/rocksdb && (chmod +t /dev/shm || true) && make clean', 'user':'root' }" @@ -100,9 +106,7 @@ NON_SHM="TMPD=/tmp/rocksdb_test_tmp" GCC_481="ROCKSDB_FBCODE_BUILD_WITH_481=1" ASAN="COMPILE_WITH_ASAN=1" CLANG="USE_CLANG=1" -# in gcc-5 there are known problems with TSAN like https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71090. 
-# using platform007 gives us gcc-8 or higher which has that bug fixed. -TSAN="ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007=1 COMPILE_WITH_TSAN=1" +TSAN="COMPILE_WITH_TSAN=1" UBSAN="COMPILE_WITH_UBSAN=1" TSAN_CRASH='CRASH_TEST_EXT_ARGS="--compression_type=zstd --log2_keys_per_lock=22"' NON_TSAN_CRASH="CRASH_TEST_EXT_ARGS=--compression_type=zstd" @@ -112,6 +116,7 @@ SETUP_JAVA_ENV="export $HTTP_PROXY; export JAVA_HOME=/usr/local/jdk-8u60-64/; ex PARSER="'parser':'python build_tools/error_filter.py $1'" CONTRUN_NAME="ROCKSDB_CONTRUN_NAME" +SKIP_FORMAT_CHECKS="SKIP_FORMAT_BUCK_CHECKS=1" # This code is getting called under various scenarios. What we care about is to # understand when it's called from nightly contruns because in that case we'll @@ -153,7 +158,7 @@ UNIT_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and test RocksDB debug version', - 'shell':'$SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=check $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -176,7 +181,7 @@ UNIT_TEST_NON_SHM_COMMANDS="[ { 'name':'Build and test RocksDB debug version', 'timeout': 86400, - 'shell':'$NON_SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=non_shm_check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $NON_SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=non_shm_check $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -197,7 +202,7 @@ RELEASE_BUILD_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build RocksDB release', - 'shell':'make $PARALLEL_j release || $CONTRUN_NAME=release $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; make $PARALLEL_j release || $CONTRUN_NAME=release $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -218,7 +223,7 @@ UNIT_TEST_COMMANDS_481="[ $CLEANUP_ENV, { 'name':'Build and test RocksDB debug version', - 'shell':'$SHM $GCC_481 $DEBUG make $PARALLELISM check || $CONTRUN_NAME=unit_gcc_481_check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $GCC_481 $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=unit_gcc_481_check $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -239,7 +244,7 @@ RELEASE_BUILD_COMMANDS_481="[ $CLEANUP_ENV, { 'name':'Build RocksDB release on GCC 4.8.1', - 'shell':'$GCC_481 make $PARALLEL_j release || $CONTRUN_NAME=release_gcc481 $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $GCC_481 make $PARALLEL_j release || $CONTRUN_NAME=release_gcc481 $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -260,7 +265,7 @@ CLANG_UNIT_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and test RocksDB debug', - 'shell':'$CLANG $SHM $DEBUG make $PARALLELISM check || $CONTRUN_NAME=clang_check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $CLANG $SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=clang_check $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -281,7 +286,7 @@ CLANG_RELEASE_BUILD_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build RocksDB release', - 'shell':'$CLANG make $PARALLEL_j release|| $CONTRUN_NAME=clang_release $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $CLANG make $PARALLEL_j release|| $CONTRUN_NAME=clang_release $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -302,7 +307,7 @@ CLANG_ANALYZE_COMMANDS="[ $CLEANUP_ENV, { 'name':'RocksDB build and analyze', - 'shell':'$CLANG $SHM $DEBUG make $PARALLEL_j analyze || $CONTRUN_NAME=clang_analyze $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $CLANG $SHM $DEBUG make $PARALLEL_j analyze || 
$CONTRUN_NAME=clang_analyze $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -323,7 +328,7 @@ CODE_COV_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build, test and collect code coverage info', - 'shell':'$SHM $DEBUG make $PARALLELISM coverage || $CONTRUN_NAME=coverage $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM coverage || $CONTRUN_NAME=coverage $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -344,7 +349,7 @@ UNITY_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build, test unity test', - 'shell':'$SHM $DEBUG V=1 make J=1 unity_test || $CONTRUN_NAME=unity_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG V=1 make J=1 unity_test || $CONTRUN_NAME=unity_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -365,7 +370,7 @@ LITE_BUILD_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build RocksDB debug version', - 'shell':'make J=1 LITE=1 all check || $CONTRUN_NAME=lite $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SKIP_FORMAT_CHECKS make J=1 LITE=1 all check || $CONTRUN_NAME=lite $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -385,7 +390,7 @@ REPORT_LITE_BINARY_SIZE_COMMANDS="[ $CLEANUP_ENV, { 'name':'Report RocksDB Lite binary size to scuba', - 'shell':'tools/report_lite_binary_size.sh', + 'shell':'cd $WORKING_DIR; tools/report_lite_binary_size.sh', 'user':'root', }, ], @@ -404,14 +409,14 @@ STRESS_CRASH_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug crash tests', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -434,14 +439,14 @@ BLACKBOX_STRESS_CRASH_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug blackbox crash tests', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 blackbox_crash_test || $CONTRUN_NAME=blackbox_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 blackbox_crash_test || $CONTRUN_NAME=blackbox_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -464,14 +469,14 @@ WHITEBOX_STRESS_CRASH_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug whitebox crash tests', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 whitebox_crash_test || $CONTRUN_NAME=whitebox_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 whitebox_crash_test || $CONTRUN_NAME=whitebox_crash_test $TASK_CREATION_TOOL', 
'user':'root', $PARSER }, @@ -494,14 +499,14 @@ STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug crash tests with atomic flush', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -524,14 +529,44 @@ STRESS_CRASH_TEST_WITH_TXN_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug crash tests with txn', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_txn || $CONTRUN_NAME=crash_test_with_txn $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_txn || $CONTRUN_NAME=crash_test_with_txn $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + $UPLOAD_DB_DIR, + ], + $REPORT + } +]" + +# +# RocksDB stress/crash test with timestamp +# +STRESS_CRASH_TEST_WITH_TS_COMMANDS="[ + { + 'name':'Rocksdb Stress and Crash Test with ts', + 'oncall':'$ONCALL', + 'executeLocal': 'true', + 'timeout': 86400, + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build and run RocksDB debug stress tests', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + { + 'name':'Build and run RocksDB debug crash tests with ts', + 'timeout': 86400, + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_ts || $CONTRUN_NAME=crash_test_with_ts $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -553,7 +588,7 @@ WRITE_STRESS_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB write stress tests', - 'shell':'make write_stress && python tools/write_stress_runner.py --runtime_sec=3600 --db=/tmp/rocksdb_write_stress || $CONTRUN_NAME=write_stress $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; make write_stress && python tools/write_stress_runner.py --runtime_sec=3600 --db=/tmp/rocksdb_write_stress || $CONTRUN_NAME=write_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER } @@ -576,7 +611,7 @@ ASAN_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Test RocksDB debug under ASAN', -'shell':'set -o pipefail && ($SHM $ASAN $DEBUG make $PARALLELISM asan_check || $CONTRUN_NAME=asan_check $TASK_CREATION_TOOL) |& /usr/facebook/ops/scripts/asan_symbolize.py -d', +'shell':'cd $WORKING_DIR; set -o pipefail && ($SHM $ASAN $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM asan_check || $CONTRUN_NAME=asan_check $TASK_CREATION_TOOL) |& /usr/facebook/ops/scripts/asan_symbolize.py -d', 'user':'root', $PARSER } @@ -599,7 +634,7 @@ ASAN_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug asan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test || 
$CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make J=1 asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -623,7 +658,7 @@ ASAN_BLACKBOX_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug blackbox asan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 blackbox_asan_crash_test || $CONTRUN_NAME=blackbox_asan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make J=1 blackbox_asan_crash_test || $CONTRUN_NAME=blackbox_asan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -647,7 +682,7 @@ ASAN_WHITEBOX_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug whitebox asan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 whitebox_asan_crash_test || $CONTRUN_NAME=whitebox_asan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make J=1 whitebox_asan_crash_test || $CONTRUN_NAME=whitebox_asan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -671,7 +706,7 @@ ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Build and run RocksDB debug asan_crash_test_with_atomic_flush', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test_with_atomic_flush || $CONTRUN_NAME=asan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make J=1 asan_crash_test_with_atomic_flush || $CONTRUN_NAME=asan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -695,7 +730,7 @@ ASAN_CRASH_TEST_WITH_TXN_COMMANDS="[ { 'name':'Build and run RocksDB debug asan_crash_test_with_txn', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test_with_txn || $CONTRUN_NAME=asan_crash_test_with_txn $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $SKIP_FORMAT_CHECKS make J=1 asan_crash_test_with_txn || $CONTRUN_NAME=asan_crash_test_with_txn $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -717,7 +752,7 @@ UBSAN_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Test RocksDB debug under UBSAN', - 'shell':'set -o pipefail && $SHM $UBSAN $CLANG $DEBUG make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $UBSAN $CLANG $DEBUG $SKIP_FORMAT_CHECKS make $PARALLELISM ubsan_check || $CONTRUN_NAME=ubsan_check $TASK_CREATION_TOOL', 'user':'root', $PARSER } @@ -740,7 +775,7 @@ UBSAN_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug ubsan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -764,7 +799,7 @@ UBSAN_BLACKBOX_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug blackbox ubsan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 blackbox_ubsan_crash_test || $CONTRUN_NAME=blackbox_ubsan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make J=1 blackbox_ubsan_crash_test || $CONTRUN_NAME=blackbox_ubsan_crash_test $TASK_CREATION_TOOL', 
'user':'root', $PARSER }, @@ -788,7 +823,7 @@ UBSAN_WHITEBOX_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug whitebox ubsan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 whitebox_ubsan_crash_test || $CONTRUN_NAME=whitebox_ubsan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make J=1 whitebox_ubsan_crash_test || $CONTRUN_NAME=whitebox_ubsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -812,7 +847,7 @@ UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Build and run RocksDB debug ubsan_crash_test_with_atomic_flush', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make J=1 ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -836,7 +871,7 @@ UBSAN_CRASH_TEST_WITH_TXN_COMMANDS="[ { 'name':'Build and run RocksDB debug ubsan_crash_test_with_txn', 'timeout': 86400, - 'shell':'$SHM $DEBUG $NON_TSAN_CRASH $CLANG make J=1 ubsan_crash_test_with_txn || $CONTRUN_NAME=ubsan_crash_test_with_txn $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG $NON_TSAN_CRASH $CLANG $SKIP_FORMAT_CHECKS make J=1 ubsan_crash_test_with_txn || $CONTRUN_NAME=ubsan_crash_test_with_txn $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -860,7 +895,7 @@ VALGRIND_TEST_COMMANDS="[ { 'name':'Run RocksDB debug unit tests', 'timeout': 86400, - 'shell':'$SHM $DEBUG make $PARALLELISM valgrind_test || $CONTRUN_NAME=valgrind_check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SHM $DEBUG make $PARALLELISM valgrind_test || $CONTRUN_NAME=valgrind_check $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -883,7 +918,7 @@ TSAN_UNIT_TEST_COMMANDS="[ { 'name':'Run RocksDB debug unit test', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN make $PARALLELISM check || $CONTRUN_NAME=tsan_check $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $SKIP_FORMAT_CHECKS make $PARALLELISM check || $CONTRUN_NAME=tsan_check $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -906,7 +941,7 @@ TSAN_CRASH_TEST_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -930,7 +965,7 @@ TSAN_BLACKBOX_CRASH_TEST_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 blackbox_crash_test || $CONTRUN_NAME=tsan_blackbox_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 blackbox_crash_test || $CONTRUN_NAME=tsan_blackbox_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -954,7 +989,7 @@ TSAN_WHITEBOX_CRASH_TEST_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 whitebox_crash_test || 
$CONTRUN_NAME=tsan_whitebox_crash_test $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 whitebox_crash_test || $CONTRUN_NAME=tsan_whitebox_crash_test $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -978,7 +1013,7 @@ TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=tsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=tsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -1002,7 +1037,7 @@ TSAN_CRASH_TEST_WITH_TXN_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_txn || $CONTRUN_NAME=tsan_crash_test_with_txn $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_txn || $CONTRUN_NAME=tsan_crash_test_with_txn $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -1036,7 +1071,7 @@ FORMAT_COMPATIBLE_COMMANDS="[ $CLEANUP_ENV, { 'name':'Run RocksDB debug unit test', - 'shell':'build_tools/rocksdb-lego-determinator run_format_compatible || $CONTRUN_NAME=run_format_compatible $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; build_tools/rocksdb-lego-determinator run_format_compatible || $CONTRUN_NAME=run_format_compatible $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -1058,6 +1093,7 @@ run_no_compression() mv .tmp.fbcode_config.sh build_tools/fbcode_config.sh cat Makefile | grep -v tools/ldb_test.py > .tmp.Makefile mv .tmp.Makefile Makefile + export $SKIP_FORMAT_CHECKS make $DEBUG J=1 check } @@ -1070,7 +1106,7 @@ NO_COMPRESSION_COMMANDS="[ $CLEANUP_ENV, { 'name':'Run RocksDB debug unit test', - 'shell':'build_tools/rocksdb-lego-determinator run_no_compression || $CONTRUN_NAME=run_no_compression $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; build_tools/rocksdb-lego-determinator run_no_compression || $CONTRUN_NAME=run_no_compression $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -1129,7 +1165,7 @@ REGRESSION_COMMANDS="[ $CLEANUP_ENV, { 'name':'Make and run script', - 'shell':'build_tools/rocksdb-lego-determinator run_regression || $CONTRUN_NAME=run_regression $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; build_tools/rocksdb-lego-determinator run_regression || $CONTRUN_NAME=run_regression $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -1150,7 +1186,7 @@ JAVA_BUILD_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build RocksDB for Java', - 'shell':'$SETUP_JAVA_ENV; $SHM make rocksdbjava || $CONTRUN_NAME=rocksdbjava $TASK_CREATION_TOOL', + 'shell':'cd $WORKING_DIR; $SETUP_JAVA_ENV; $SHM make rocksdbjava || $CONTRUN_NAME=rocksdbjava $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -1212,6 +1248,9 @@ case $1 in stress_crash_with_txn) echo $STRESS_CRASH_TEST_WITH_TXN_COMMANDS ;; + stress_crash_with_ts) + echo $STRESS_CRASH_TEST_WITH_TS_COMMANDS + ;; write_stress) echo $WRITE_STRESS_COMMANDS ;; diff --git a/build_tools/run_ci_db_test.ps1 b/build_tools/run_ci_db_test.ps1 index 883d4e2a5c6..9aea51708cb 100644 --- a/build_tools/run_ci_db_test.ps1 +++ b/build_tools/run_ci_db_test.ps1 @@ -68,7 +68,7 @@ $BinariesFolder = -Join($RootFolder, 
"\build\Debug\") if($WorkFolder -eq "") { - # If TEST_TMPDIR is set use it + # If TEST_TMPDIR is set use it [string]$var = $Env:TEST_TMPDIR if($var -eq "") { $WorkFolder = -Join($RootFolder, "\db_tests\") @@ -93,7 +93,7 @@ $ExcludeCasesSet = New-Object System.Collections.Generic.HashSet[string] if($ExcludeCases -ne "") { Write-Host "ExcludeCases: $ExcludeCases" $l = $ExcludeCases -split ' ' - ForEach($t in $l) { + ForEach($t in $l) { $ExcludeCasesSet.Add($t) | Out-Null } } @@ -102,7 +102,7 @@ $ExcludeExesSet = New-Object System.Collections.Generic.HashSet[string] if($ExcludeExes -ne "") { Write-Host "ExcludeExe: $ExcludeExes" $l = $ExcludeExes -split ' ' - ForEach($t in $l) { + ForEach($t in $l) { $ExcludeExesSet.Add($t) | Out-Null } } @@ -118,6 +118,10 @@ if($ExcludeExes -ne "") { # MultiThreaded/MultiThreadedDBTest. # MultiThreaded/0 # GetParam() = 0 # MultiThreaded/1 # GetParam() = 1 +# RibbonTypeParamTest/0. # TypeParam = struct DefaultTypesAndSettings +# CompactnessAndBacktrackAndFpRate +# Extremes +# FindOccupancyForSuccessRate # # into this: # @@ -125,6 +129,9 @@ if($ExcludeExes -ne "") { # DBTest.WriteEmptyBatch # MultiThreaded/MultiThreadedDBTest.MultiThreaded/0 # MultiThreaded/MultiThreadedDBTest.MultiThreaded/1 +# RibbonTypeParamTest/0.CompactnessAndBacktrackAndFpRate +# RibbonTypeParamTest/0.Extremes +# RibbonTypeParamTest/0.FindOccupancyForSuccessRate # # Output into the parameter in a form TestName -> Log File Name function ExtractTestCases([string]$GTestExe, $HashTable) { @@ -138,6 +145,8 @@ function ExtractTestCases([string]$GTestExe, $HashTable) { ForEach( $l in $Tests) { + # remove trailing comment if any + $l = $l -replace '\s+\#.*','' # Leading whitespace is fine $l = $l -replace '^\s+','' # Trailing dot is a test group but no whitespace @@ -146,8 +155,7 @@ function ExtractTestCases([string]$GTestExe, $HashTable) { } else { # Otherwise it is a test name, remove leading space $test = $l - # remove trailing comment if any and create a log name - $test = $test -replace '\s+\#.*','' + # create a log name $test = "$Group$test" if($ExcludeCasesSet.Contains($test)) { @@ -253,7 +261,7 @@ if($Run -ne "") { $DiscoveredExe = @() dir -Path $search_path | ForEach-Object { - $DiscoveredExe += ($_.Name) + $DiscoveredExe += ($_.Name) } # Remove exclusions @@ -293,7 +301,7 @@ if($SuiteRun -ne "") { $ListOfExe = @() dir -Path $search_path | ForEach-Object { - $ListOfExe += ($_.Name) + $ListOfExe += ($_.Name) } # Exclude those in RunOnly from running as suites @@ -348,7 +356,7 @@ function RunJobs($Suites, $TestCmds, [int]$ConcurrencyVal) # Wait for all to finish and get the results while(($JobToLog.Count -gt 0) -or - ($TestCmds.Count -gt 0) -or + ($TestCmds.Count -gt 0) -or ($Suites.Count -gt 0)) { # Make sure we have maximum concurrent jobs running if anything @@ -468,8 +476,8 @@ RunJobs -Suites $CasesToRun -TestCmds $TestExes -ConcurrencyVal $Concurrency $EndDate = (Get-Date) -New-TimeSpan -Start $StartDate -End $EndDate | - ForEach-Object { +New-TimeSpan -Start $StartDate -End $EndDate | + ForEach-Object { "Elapsed time: {0:g}" -f $_ } @@ -484,4 +492,4 @@ if(!$script:success) { exit 0 - + diff --git a/cache/cache.cc b/cache/cache.cc index 78897c41614..4eef1c2d61e 100644 --- a/cache/cache.cc +++ b/cache/cache.cc @@ -44,9 +44,9 @@ Status Cache::CreateFromString(const ConfigOptions& config_options, } else { #ifndef ROCKSDB_LITE LRUCacheOptions cache_opts; - status = OptionTypeInfo::ParseStruct( - config_options, "", &lru_cache_options_type_info, "", value, - 
reinterpret_cast(&cache_opts)); + status = OptionTypeInfo::ParseStruct(config_options, "", + &lru_cache_options_type_info, "", + value, &cache_opts); if (status.ok()) { cache = NewLRUCache(cache_opts); } diff --git a/cache/cache_bench.cc b/cache/cache_bench.cc index 7d246759694..0669354ae19 100644 --- a/cache/cache_bench.cc +++ b/cache/cache_bench.cc @@ -1,8 +1,11 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). - +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. #ifndef GFLAGS #include int main() { @@ -10,372 +13,8 @@ int main() { return 1; } #else - -#include -#include -#include -#include - -#include "port/port.h" -#include "rocksdb/cache.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "util/coding.h" -#include "util/gflags_compat.h" -#include "util/hash.h" -#include "util/mutexlock.h" -#include "util/random.h" - -using GFLAGS_NAMESPACE::ParseCommandLineFlags; - -static constexpr uint32_t KiB = uint32_t{1} << 10; -static constexpr uint32_t MiB = KiB << 10; -static constexpr uint64_t GiB = MiB << 10; - -DEFINE_uint32(threads, 16, "Number of concurrent threads to run."); -DEFINE_uint64(cache_size, 1 * GiB, - "Number of bytes to use as a cache of uncompressed data."); -DEFINE_uint32(num_shard_bits, 6, "shard_bits."); - -DEFINE_double(resident_ratio, 0.25, - "Ratio of keys fitting in cache to keyspace."); -DEFINE_uint64(ops_per_thread, 0, - "Number of operations per thread. (Default: 5 * keyspace size)"); -DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added."); - -DEFINE_uint32(skew, 5, "Degree of skew in key selection"); -DEFINE_bool(populate_cache, true, "Populate cache before operations"); - -DEFINE_uint32(lookup_insert_percent, 87, - "Ratio of lookup (+ insert on not found) to total workload " - "(expressed as a percentage)"); -DEFINE_uint32(insert_percent, 2, - "Ratio of insert to total workload (expressed as a percentage)"); -DEFINE_uint32(lookup_percent, 10, - "Ratio of lookup to total workload (expressed as a percentage)"); -DEFINE_uint32(erase_percent, 1, - "Ratio of erase to total workload (expressed as a percentage)"); - -DEFINE_bool(use_clock_cache, false, ""); - -namespace ROCKSDB_NAMESPACE { - -class CacheBench; -namespace { -// State shared by all concurrent executions of the same benchmark. 
-class SharedState { - public: - explicit SharedState(CacheBench* cache_bench) - : cv_(&mu_), - num_initialized_(0), - start_(false), - num_done_(0), - cache_bench_(cache_bench) {} - - ~SharedState() {} - - port::Mutex* GetMutex() { - return &mu_; - } - - port::CondVar* GetCondVar() { - return &cv_; - } - - CacheBench* GetCacheBench() const { - return cache_bench_; - } - - void IncInitialized() { - num_initialized_++; - } - - void IncDone() { - num_done_++; - } - - bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; } - - bool AllDone() const { return num_done_ >= FLAGS_threads; } - - void SetStart() { - start_ = true; - } - - bool Started() const { - return start_; - } - - private: - port::Mutex mu_; - port::CondVar cv_; - - uint64_t num_initialized_; - bool start_; - uint64_t num_done_; - - CacheBench* cache_bench_; -}; - -// Per-thread state for concurrent executions of the same benchmark. -struct ThreadState { - uint32_t tid; - Random64 rnd; - SharedState* shared; - - ThreadState(uint32_t index, SharedState* _shared) - : tid(index), rnd(1000 + index), shared(_shared) {} -}; - -struct KeyGen { - char key_data[27]; - - Slice GetRand(Random64& rnd, uint64_t max_key) { - uint64_t raw = rnd.Next(); - // Skew according to setting - for (uint32_t i = 0; i < FLAGS_skew; ++i) { - raw = std::min(raw, rnd.Next()); - } - uint64_t key = FastRange64(raw, max_key); - // Variable size and alignment - size_t off = key % 8; - key_data[0] = char{42}; - EncodeFixed64(key_data + 1, key); - key_data[9] = char{11}; - EncodeFixed64(key_data + 10, key); - key_data[18] = char{4}; - EncodeFixed64(key_data + 19, key); - return Slice(&key_data[off], sizeof(key_data) - off); - } -}; - -char* createValue(Random64& rnd) { - char* rv = new char[FLAGS_value_bytes]; - // Fill with some filler data, and take some CPU time - for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) { - EncodeFixed64(rv + i, rnd.Next()); - } - return rv; -} - -void deleter(const Slice& /*key*/, void* value) { - delete[] static_cast(value); -} -} // namespace - -class CacheBench { - static constexpr uint64_t kHundredthUint64 = - std::numeric_limits::max() / 100U; - - public: - CacheBench() - : max_key_(static_cast(FLAGS_cache_size / FLAGS_resident_ratio / - FLAGS_value_bytes)), - lookup_insert_threshold_(kHundredthUint64 * - FLAGS_lookup_insert_percent), - insert_threshold_(lookup_insert_threshold_ + - kHundredthUint64 * FLAGS_insert_percent), - lookup_threshold_(insert_threshold_ + - kHundredthUint64 * FLAGS_lookup_percent), - erase_threshold_(lookup_threshold_ + - kHundredthUint64 * FLAGS_erase_percent) { - if (erase_threshold_ != 100U * kHundredthUint64) { - fprintf(stderr, "Percentages must add to 100.\n"); - exit(1); - } - if (FLAGS_use_clock_cache) { - cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits); - if (!cache_) { - fprintf(stderr, "Clock cache not supported.\n"); - exit(1); - } - } else { - cache_ = NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits); - } - if (FLAGS_ops_per_thread == 0) { - FLAGS_ops_per_thread = 5 * max_key_; - } - } - - ~CacheBench() {} - - void PopulateCache() { - Random64 rnd(1); - KeyGen keygen; - for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) { - cache_->Insert(keygen.GetRand(rnd, max_key_), createValue(rnd), - FLAGS_value_bytes, &deleter); - } - } - - bool Run() { - ROCKSDB_NAMESPACE::Env* env = ROCKSDB_NAMESPACE::Env::Default(); - - PrintEnv(); - SharedState shared(this); - std::vector > threads(FLAGS_threads); - for (uint32_t i = 0; i < FLAGS_threads; 
i++) { - threads[i].reset(new ThreadState(i, &shared)); - env->StartThread(ThreadBody, threads[i].get()); - } - { - MutexLock l(shared.GetMutex()); - while (!shared.AllInitialized()) { - shared.GetCondVar()->Wait(); - } - // Record start time - uint64_t start_time = env->NowMicros(); - - // Start all threads - shared.SetStart(); - shared.GetCondVar()->SignalAll(); - - // Wait threads to complete - while (!shared.AllDone()) { - shared.GetCondVar()->Wait(); - } - - // Record end time - uint64_t end_time = env->NowMicros(); - double elapsed = static_cast(end_time - start_time) * 1e-6; - uint32_t qps = static_cast( - static_cast(FLAGS_threads * FLAGS_ops_per_thread) / elapsed); - fprintf(stdout, "Complete in %.3f s; QPS = %u\n", elapsed, qps); - } - return true; - } - - private: - std::shared_ptr cache_; - const uint64_t max_key_; - // Cumulative thresholds in the space of a random uint64_t - const uint64_t lookup_insert_threshold_; - const uint64_t insert_threshold_; - const uint64_t lookup_threshold_; - const uint64_t erase_threshold_; - - static void ThreadBody(void* v) { - ThreadState* thread = static_cast(v); - SharedState* shared = thread->shared; - - { - MutexLock l(shared->GetMutex()); - shared->IncInitialized(); - if (shared->AllInitialized()) { - shared->GetCondVar()->SignalAll(); - } - while (!shared->Started()) { - shared->GetCondVar()->Wait(); - } - } - thread->shared->GetCacheBench()->OperateCache(thread); - - { - MutexLock l(shared->GetMutex()); - shared->IncDone(); - if (shared->AllDone()) { - shared->GetCondVar()->SignalAll(); - } - } - } - - void OperateCache(ThreadState* thread) { - // To use looked-up values - uint64_t result = 0; - // To hold handles for a non-trivial amount of time - Cache::Handle* handle = nullptr; - KeyGen gen; - for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { - Slice key = gen.GetRand(thread->rnd, max_key_); - uint64_t random_op = thread->rnd.Next(); - if (random_op < lookup_insert_threshold_) { - if (handle) { - cache_->Release(handle); - handle = nullptr; - } - // do lookup - handle = cache_->Lookup(key); - if (handle) { - // do something with the data - result += NPHash64(static_cast(cache_->Value(handle)), - FLAGS_value_bytes); - } else { - // do insert - cache_->Insert(key, createValue(thread->rnd), FLAGS_value_bytes, - &deleter, &handle); - } - } else if (random_op < insert_threshold_) { - if (handle) { - cache_->Release(handle); - handle = nullptr; - } - // do insert - cache_->Insert(key, createValue(thread->rnd), FLAGS_value_bytes, - &deleter, &handle); - } else if (random_op < lookup_threshold_) { - if (handle) { - cache_->Release(handle); - handle = nullptr; - } - // do lookup - handle = cache_->Lookup(key); - if (handle) { - // do something with the data - result += NPHash64(static_cast(cache_->Value(handle)), - FLAGS_value_bytes); - } - } else if (random_op < erase_threshold_) { - // do erase - cache_->Erase(key); - } else { - // Should be extremely unlikely (noop) - assert(random_op >= kHundredthUint64 * 100U); - } - } - if (handle) { - cache_->Release(handle); - handle = nullptr; - } - } - - void PrintEnv() const { - printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); - printf("Number of threads : %u\n", FLAGS_threads); - printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); - printf("Cache size : %" PRIu64 "\n", FLAGS_cache_size); - printf("Num shard bits : %u\n", FLAGS_num_shard_bits); - printf("Max key : %" PRIu64 "\n", max_key_); - printf("Resident ratio : %g\n", FLAGS_resident_ratio); - printf("Skew 
degree : %u\n", FLAGS_skew); - printf("Populate cache : %d\n", int{FLAGS_populate_cache}); - printf("Lookup+Insert pct : %u%%\n", FLAGS_lookup_insert_percent); - printf("Insert percentage : %u%%\n", FLAGS_insert_percent); - printf("Lookup percentage : %u%%\n", FLAGS_lookup_percent); - printf("Erase percentage : %u%%\n", FLAGS_erase_percent); - printf("----------------------------\n"); - } -}; -} // namespace ROCKSDB_NAMESPACE - +#include int main(int argc, char** argv) { - ParseCommandLineFlags(&argc, &argv, true); - - if (FLAGS_threads <= 0) { - fprintf(stderr, "threads number <= 0\n"); - exit(1); - } - - ROCKSDB_NAMESPACE::CacheBench bench; - if (FLAGS_populate_cache) { - bench.PopulateCache(); - printf("Population complete\n"); - printf("----------------------------\n"); - } - if (bench.Run()) { - return 0; - } else { - return 1; - } + return ROCKSDB_NAMESPACE::cache_bench_tool(argc, argv); } - #endif // GFLAGS diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc new file mode 100644 index 00000000000..9fadf85a90b --- /dev/null +++ b/cache/cache_bench_tool.cc @@ -0,0 +1,573 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS +#include +#include +#include +#include +#include + +#include "monitoring/histogram.h" +#include "port/port.h" +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/secondary_cache.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/utilities/object_registry.h" +#include "table/block_based/cachable_entry.h" +#include "util/coding.h" +#include "util/gflags_compat.h" +#include "util/hash.h" +#include "util/mutexlock.h" +#include "util/random.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +static constexpr uint32_t KiB = uint32_t{1} << 10; +static constexpr uint32_t MiB = KiB << 10; +static constexpr uint64_t GiB = MiB << 10; + +DEFINE_uint32(threads, 16, "Number of concurrent threads to run."); +DEFINE_uint64(cache_size, 1 * GiB, + "Number of bytes to use as a cache of uncompressed data."); +DEFINE_uint32(num_shard_bits, 6, "shard_bits."); + +DEFINE_double(resident_ratio, 0.25, + "Ratio of keys fitting in cache to keyspace."); +DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread."); +DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added."); + +DEFINE_uint32(skew, 5, "Degree of skew in key selection"); +DEFINE_bool(populate_cache, true, "Populate cache before operations"); + +DEFINE_uint32(lookup_insert_percent, 87, + "Ratio of lookup (+ insert on not found) to total workload " + "(expressed as a percentage)"); +DEFINE_uint32(insert_percent, 2, + "Ratio of insert to total workload (expressed as a percentage)"); +DEFINE_uint32(lookup_percent, 10, + "Ratio of lookup to total workload (expressed as a percentage)"); +DEFINE_uint32(erase_percent, 1, + "Ratio of erase to total workload (expressed as a percentage)"); +DEFINE_bool(gather_stats, false, + "Whether to periodically simulate gathering block cache stats, " + "using one more thread."); +DEFINE_uint32( + gather_stats_sleep_ms, 1000, + "How many milliseconds to sleep between each gathering of stats."); + +DEFINE_uint32(gather_stats_entries_per_lock, 256, + "For Cache::ApplyToAllEntries"); +DEFINE_bool(skewed, false, "If true, skew the 
key access distribution"); +#ifndef ROCKSDB_LITE +DEFINE_string(secondary_cache_uri, "", + "Full URI for creating a custom secondary cache object"); +static class std::shared_ptr secondary_cache; +#endif // ROCKSDB_LITE + +DEFINE_bool(use_clock_cache, false, ""); + +namespace ROCKSDB_NAMESPACE { + +class CacheBench; +namespace { +// State shared by all concurrent executions of the same benchmark. +class SharedState { + public: + explicit SharedState(CacheBench* cache_bench) + : cv_(&mu_), + num_initialized_(0), + start_(false), + num_done_(0), + cache_bench_(cache_bench) {} + + ~SharedState() {} + + port::Mutex* GetMutex() { return &mu_; } + + port::CondVar* GetCondVar() { return &cv_; } + + CacheBench* GetCacheBench() const { return cache_bench_; } + + void IncInitialized() { num_initialized_++; } + + void IncDone() { num_done_++; } + + bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; } + + bool AllDone() const { return num_done_ >= FLAGS_threads; } + + void SetStart() { start_ = true; } + + bool Started() const { return start_; } + + private: + port::Mutex mu_; + port::CondVar cv_; + + uint64_t num_initialized_; + bool start_; + uint64_t num_done_; + + CacheBench* cache_bench_; +}; + +// Per-thread state for concurrent executions of the same benchmark. +struct ThreadState { + uint32_t tid; + Random64 rnd; + SharedState* shared; + HistogramImpl latency_ns_hist; + uint64_t duration_us = 0; + + ThreadState(uint32_t index, SharedState* _shared) + : tid(index), rnd(1000 + index), shared(_shared) {} +}; + +struct KeyGen { + char key_data[27]; + + Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) { + uint64_t key = 0; + if (!FLAGS_skewed) { + uint64_t raw = rnd.Next(); + // Skew according to setting + for (uint32_t i = 0; i < FLAGS_skew; ++i) { + raw = std::min(raw, rnd.Next()); + } + key = FastRange64(raw, max_key); + } else { + key = rnd.Skewed(max_log); + if (key > max_key) { + key -= max_key; + } + } + // Variable size and alignment + size_t off = key % 8; + key_data[0] = char{42}; + EncodeFixed64(key_data + 1, key); + key_data[9] = char{11}; + EncodeFixed64(key_data + 10, key); + key_data[18] = char{4}; + EncodeFixed64(key_data + 19, key); + return Slice(&key_data[off], sizeof(key_data) - off); + } +}; + +char* createValue(Random64& rnd) { + char* rv = new char[FLAGS_value_bytes]; + // Fill with some filler data, and take some CPU time + for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) { + EncodeFixed64(rv + i, rnd.Next()); + } + return rv; +} + +// Callbacks for secondary cache +size_t SizeFn(void* /*obj*/) { return FLAGS_value_bytes; } + +Status SaveToFn(void* obj, size_t /*offset*/, size_t size, void* out) { + memcpy(out, obj, size); + return Status::OK(); +} + +// Different deleters to simulate using deleter to gather +// stats on the code origin and kind of cache entries. 
+void deleter1(const Slice& /*key*/, void* value) { + delete[] static_cast(value); +} +void deleter2(const Slice& /*key*/, void* value) { + delete[] static_cast(value); +} +void deleter3(const Slice& /*key*/, void* value) { + delete[] static_cast(value); +} + +Cache::CacheItemHelper helper1(SizeFn, SaveToFn, deleter1); +Cache::CacheItemHelper helper2(SizeFn, SaveToFn, deleter2); +Cache::CacheItemHelper helper3(SizeFn, SaveToFn, deleter3); +} // namespace + +class CacheBench { + static constexpr uint64_t kHundredthUint64 = + std::numeric_limits::max() / 100U; + + public: + CacheBench() + : max_key_(static_cast(FLAGS_cache_size / FLAGS_resident_ratio / + FLAGS_value_bytes)), + lookup_insert_threshold_(kHundredthUint64 * + FLAGS_lookup_insert_percent), + insert_threshold_(lookup_insert_threshold_ + + kHundredthUint64 * FLAGS_insert_percent), + lookup_threshold_(insert_threshold_ + + kHundredthUint64 * FLAGS_lookup_percent), + erase_threshold_(lookup_threshold_ + + kHundredthUint64 * FLAGS_erase_percent), + skewed_(FLAGS_skewed) { + if (erase_threshold_ != 100U * kHundredthUint64) { + fprintf(stderr, "Percentages must add to 100.\n"); + exit(1); + } + + max_log_ = 0; + if (skewed_) { + uint64_t max_key = max_key_; + while (max_key >>= 1) max_log_++; + if (max_key > (1u << max_log_)) max_log_++; + } + + if (FLAGS_use_clock_cache) { + cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits); + if (!cache_) { + fprintf(stderr, "Clock cache not supported.\n"); + exit(1); + } + } else { + LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits, false, 0.5); +#ifndef ROCKSDB_LITE + if (!FLAGS_secondary_cache_uri.empty()) { + Status s = + ObjectRegistry::NewInstance()->NewSharedObject( + FLAGS_secondary_cache_uri, &secondary_cache); + if (secondary_cache == nullptr) { + fprintf( + stderr, + "No secondary cache registered matching string: %s status=%s\n", + FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); + exit(1); + } + opts.secondary_cache = secondary_cache; + } +#endif // ROCKSDB_LITE + + cache_ = NewLRUCache(opts); + } + } + + ~CacheBench() {} + + void PopulateCache() { + Random64 rnd(1); + KeyGen keygen; + for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) { + cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_), createValue(rnd), + &helper1, FLAGS_value_bytes); + } + } + + bool Run() { + const auto clock = SystemClock::Default().get(); + + PrintEnv(); + SharedState shared(this); + std::vector > threads(FLAGS_threads); + for (uint32_t i = 0; i < FLAGS_threads; i++) { + threads[i].reset(new ThreadState(i, &shared)); + std::thread(ThreadBody, threads[i].get()).detach(); + } + + HistogramImpl stats_hist; + std::string stats_report; + std::thread stats_thread(StatsBody, &shared, &stats_hist, &stats_report); + + uint64_t start_time; + { + MutexLock l(shared.GetMutex()); + while (!shared.AllInitialized()) { + shared.GetCondVar()->Wait(); + } + // Record start time + start_time = clock->NowMicros(); + + // Start all threads + shared.SetStart(); + shared.GetCondVar()->SignalAll(); + + // Wait threads to complete + while (!shared.AllDone()) { + shared.GetCondVar()->Wait(); + } + } + + // Stats gathering is considered background work. This time measurement + // is for foreground work, and not really ideal for that. See below. + uint64_t end_time = clock->NowMicros(); + stats_thread.join(); + + // Wall clock time - includes idle time if threads + // finish at different times (not ideal). 
+ double elapsed_secs = static_cast(end_time - start_time) * 1e-6; + uint32_t ops_per_sec = static_cast( + 1.0 * FLAGS_threads * FLAGS_ops_per_thread / elapsed_secs); + printf("Complete in %.3f s; Rough parallel ops/sec = %u\n", elapsed_secs, + ops_per_sec); + + // Total time in each thread (more accurate throughput measure) + elapsed_secs = 0; + for (uint32_t i = 0; i < FLAGS_threads; i++) { + elapsed_secs += threads[i]->duration_us * 1e-6; + } + ops_per_sec = static_cast(1.0 * FLAGS_threads * + FLAGS_ops_per_thread / elapsed_secs); + printf("Thread ops/sec = %u\n", ops_per_sec); + + printf("\nOperation latency (ns):\n"); + HistogramImpl combined; + for (uint32_t i = 0; i < FLAGS_threads; i++) { + combined.Merge(threads[i]->latency_ns_hist); + } + printf("%s", combined.ToString().c_str()); + + if (FLAGS_gather_stats) { + printf("\nGather stats latency (us):\n"); + printf("%s", stats_hist.ToString().c_str()); + } + + printf("\n%s", stats_report.c_str()); + + return true; + } + + private: + std::shared_ptr cache_; + const uint64_t max_key_; + // Cumulative thresholds in the space of a random uint64_t + const uint64_t lookup_insert_threshold_; + const uint64_t insert_threshold_; + const uint64_t lookup_threshold_; + const uint64_t erase_threshold_; + const bool skewed_; + int max_log_; + + // A benchmark version of gathering stats on an active block cache by + // iterating over it. The primary purpose is to measure the impact of + // gathering stats with ApplyToAllEntries on throughput- and + // latency-sensitive Cache users. Performance of stats gathering is + // also reported. The last set of gathered stats is also reported, for + // manual sanity checking for logical errors or other unexpected + // behavior of cache_bench or the underlying Cache. + static void StatsBody(SharedState* shared, HistogramImpl* stats_hist, + std::string* stats_report) { + if (!FLAGS_gather_stats) { + return; + } + const auto clock = SystemClock::Default().get(); + uint64_t total_key_size = 0; + uint64_t total_charge = 0; + uint64_t total_entry_count = 0; + std::set deleters; + StopWatchNano timer(clock); + + for (;;) { + uint64_t time; + time = clock->NowMicros(); + uint64_t deadline = time + uint64_t{FLAGS_gather_stats_sleep_ms} * 1000; + + { + MutexLock l(shared->GetMutex()); + for (;;) { + if (shared->AllDone()) { + std::ostringstream ostr; + ostr << "Most recent cache entry stats:\n" + << "Number of entries: " << total_entry_count << "\n" + << "Total charge: " << BytesToHumanString(total_charge) << "\n" + << "Average key size: " + << (1.0 * total_key_size / total_entry_count) << "\n" + << "Average charge: " + << BytesToHumanString(1.0 * total_charge / total_entry_count) + << "\n" + << "Unique deleters: " << deleters.size() << "\n"; + *stats_report = ostr.str(); + return; + } + if (clock->NowMicros() >= deadline) { + break; + } + uint64_t diff = deadline - std::min(clock->NowMicros(), deadline); + shared->GetCondVar()->TimedWait(diff + 1); + } + } + + // Now gather stats, outside of mutex + total_key_size = 0; + total_charge = 0; + total_entry_count = 0; + deleters.clear(); + auto fn = [&](const Slice& key, void* /*value*/, size_t charge, + Cache::DeleterFn deleter) { + total_key_size += key.size(); + total_charge += charge; + ++total_entry_count; + // Something slightly more expensive as in (future) stats by category + deleters.insert(deleter); + }; + timer.Start(); + Cache::ApplyToAllEntriesOptions opts; + opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock; + 
shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts); + stats_hist->Add(timer.ElapsedNanos() / 1000); + } + } + + static void ThreadBody(ThreadState* thread) { + SharedState* shared = thread->shared; + + { + MutexLock l(shared->GetMutex()); + shared->IncInitialized(); + if (shared->AllInitialized()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->Started()) { + shared->GetCondVar()->Wait(); + } + } + thread->shared->GetCacheBench()->OperateCache(thread); + + { + MutexLock l(shared->GetMutex()); + shared->IncDone(); + if (shared->AllDone()) { + shared->GetCondVar()->SignalAll(); + } + } + } + + void OperateCache(ThreadState* thread) { + // To use looked-up values + uint64_t result = 0; + // To hold handles for a non-trivial amount of time + Cache::Handle* handle = nullptr; + KeyGen gen; + const auto clock = SystemClock::Default().get(); + uint64_t start_time = clock->NowMicros(); + StopWatchNano timer(clock); + + for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { + timer.Start(); + Slice key = gen.GetRand(thread->rnd, max_key_, max_log_); + uint64_t random_op = thread->rnd.Next(); + Cache::CreateCallback create_cb = + [](void* buf, size_t size, void** out_obj, size_t* charge) -> Status { + *out_obj = reinterpret_cast(new char[size]); + memcpy(*out_obj, buf, size); + *charge = size; + return Status::OK(); + }; + + if (random_op < lookup_insert_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do lookup + handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW, + true); + if (handle) { + // do something with the data + result += NPHash64(static_cast(cache_->Value(handle)), + FLAGS_value_bytes); + } else { + // do insert + cache_->Insert(key, createValue(thread->rnd), &helper2, + FLAGS_value_bytes, &handle); + } + } else if (random_op < insert_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do insert + cache_->Insert(key, createValue(thread->rnd), &helper3, + FLAGS_value_bytes, &handle); + } else if (random_op < lookup_threshold_) { + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // do lookup + handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW, + true); + if (handle) { + // do something with the data + result += NPHash64(static_cast(cache_->Value(handle)), + FLAGS_value_bytes); + } + } else if (random_op < erase_threshold_) { + // do erase + cache_->Erase(key); + } else { + // Should be extremely unlikely (noop) + assert(random_op >= kHundredthUint64 * 100U); + } + thread->latency_ns_hist.Add(timer.ElapsedNanos()); + } + if (handle) { + cache_->Release(handle); + handle = nullptr; + } + // Ensure computations on `result` are not optimized away. + if (result == 1) { + printf("You are extremely unlucky(2). 
Try again.\n"); + exit(1); + } + thread->duration_us = clock->NowMicros() - start_time; + } + + void PrintEnv() const { + printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); + printf("Number of threads : %u\n", FLAGS_threads); + printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); + printf("Cache size : %s\n", + BytesToHumanString(FLAGS_cache_size).c_str()); + printf("Num shard bits : %u\n", FLAGS_num_shard_bits); + printf("Max key : %" PRIu64 "\n", max_key_); + printf("Resident ratio : %g\n", FLAGS_resident_ratio); + printf("Skew degree : %u\n", FLAGS_skew); + printf("Populate cache : %d\n", int{FLAGS_populate_cache}); + printf("Lookup+Insert pct : %u%%\n", FLAGS_lookup_insert_percent); + printf("Insert percentage : %u%%\n", FLAGS_insert_percent); + printf("Lookup percentage : %u%%\n", FLAGS_lookup_percent); + printf("Erase percentage : %u%%\n", FLAGS_erase_percent); + std::ostringstream stats; + if (FLAGS_gather_stats) { + stats << "enabled (" << FLAGS_gather_stats_sleep_ms << "ms, " + << FLAGS_gather_stats_entries_per_lock << "/lock)"; + } else { + stats << "disabled"; + } + printf("Gather stats : %s\n", stats.str().c_str()); + printf("----------------------------\n"); + } +}; + +int cache_bench_tool(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_threads <= 0) { + fprintf(stderr, "threads number <= 0\n"); + exit(1); + } + + ROCKSDB_NAMESPACE::CacheBench bench; + if (FLAGS_populate_cache) { + bench.PopulateCache(); + printf("Population complete\n"); + printf("----------------------------\n"); + } + if (bench.Run()) { + return 0; + } else { + return 1; + } +} // namespace ROCKSDB_NAMESPACE +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff --git a/cache/cache_entry_roles.cc b/cache/cache_entry_roles.cc new file mode 100644 index 00000000000..dbc71206398 --- /dev/null +++ b/cache/cache_entry_roles.cc @@ -0,0 +1,66 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "cache/cache_entry_roles.h" + +#include + +#include "port/lang.h" + +namespace ROCKSDB_NAMESPACE { + +std::array kCacheEntryRoleToCamelString{{ + "DataBlock", + "FilterBlock", + "FilterMetaBlock", + "DeprecatedFilterBlock", + "IndexBlock", + "OtherBlock", + "WriteBuffer", + "Misc", +}}; + +std::array kCacheEntryRoleToHyphenString{{ + "data-block", + "filter-block", + "filter-meta-block", + "deprecated-filter-block", + "index-block", + "other-block", + "write-buffer", + "misc", +}}; + +namespace { + +struct Registry { + std::mutex mutex; + std::unordered_map role_map; + void Register(Cache::DeleterFn fn, CacheEntryRole role) { + std::lock_guard lock(mutex); + role_map[fn] = role; + } + std::unordered_map Copy() { + std::lock_guard lock(mutex); + return role_map; + } +}; + +Registry& GetRegistry() { + STATIC_AVOID_DESTRUCTION(Registry, registry); + return registry; +} + +} // namespace + +void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role) { + GetRegistry().Register(fn, role); +} + +std::unordered_map CopyCacheDeleterRoleMap() { + return GetRegistry().Copy(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_entry_roles.h b/cache/cache_entry_roles.h new file mode 100644 index 00000000000..22148e00c41 --- /dev/null +++ b/cache/cache_entry_roles.h @@ -0,0 +1,122 @@ +// Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "rocksdb/cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Classifications of block cache entries, for reporting statistics
+enum class CacheEntryRole {
+  // Block-based table data block
+  kDataBlock,
+  // Block-based table filter block (full or partitioned)
+  kFilterBlock,
+  // Block-based table metadata block for partitioned filter
+  kFilterMetaBlock,
+  // Block-based table deprecated filter block (old "block-based" filter)
+  kDeprecatedFilterBlock,
+  // Block-based table index block
+  kIndexBlock,
+  // Other kinds of block-based table block
+  kOtherBlock,
+  // WriteBufferManager reservations to account for memtable usage
+  kWriteBuffer,
+  // Default bucket, for miscellaneous cache entries. Do not use for
+  // entries that could potentially add up to large usage.
+  kMisc,
+};
+constexpr uint32_t kNumCacheEntryRoles =
+    static_cast<uint32_t>(CacheEntryRole::kMisc) + 1;
+
+extern std::array<const char*, kNumCacheEntryRoles>
+    kCacheEntryRoleToCamelString;
+extern std::array<const char*, kNumCacheEntryRoles>
+    kCacheEntryRoleToHyphenString;
+
+// To associate cache entries with their role, we use a hack on the
+// existing Cache interface. Because the deleter of an entry can authenticate
+// the code origin of an entry, we can elaborate the choice of deleter to
+// also encode role information, without inferring false role information
+// from entries not choosing to encode a role.
+//
+// The rest of this file is for handling mappings between deleters and
+// roles.
+
+// To infer a role from a deleter, the deleter must be registered. This
+// can be done "manually" with this function. This function is thread-safe,
+// and the registration mappings go into private but static storage. (Note
+// that DeleterFn is a function pointer, not std::function. Registrations
+// should not be too many.)
+void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role);
+
+// Gets a copy of the registered deleter -> role mappings. This is the only
+// function for reading the mappings made with RegisterCacheDeleterRole.
+// Why only this interface for reading?
+// * This function has to be thread safe, which could incur substantial
+//   overhead. We should not pay this overhead for every deleter look-up.
+// * This is suitable for preparing for batch operations, like with
+//   CacheEntryStatsCollector.
+// * The number of mappings should be sufficiently small (dozens).
+std::unordered_map<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap();
+
+// ************************************************************** //
+// An automatic registration infrastructure. This enables code
+// to simply ask for a deleter associated with a particular type
+// and role, and registration is automatic. In a sense, this is
+// a small dependency injection infrastructure, because linking
+// in new deleter instantiations is essentially sufficient for
+// making stats collection (using CopyCacheDeleterRoleMap) aware
+// of them.
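To make the deleter-to-role mapping described above concrete, here is a minimal, hypothetical caller-side sketch (editorial, not part of the patch). It uses RegisterCacheDeleterRole/CopyCacheDeleterRoleMap declared here and GetCacheEntryDeleterForRole defined just below; `MyBlockContents`, `InsertWithRole`, and `RoleOf` are assumed placeholder names.

#include <string>

#include "cache/cache_entry_roles.h"
#include "rocksdb/cache.h"

using namespace ROCKSDB_NAMESPACE;  // for brevity in this sketch

struct MyBlockContents {  // hypothetical cached value type
  std::string data;
};

void InsertWithRole(Cache* cache, const Slice& key, MyBlockContents* value,
                    size_t charge) {
  // The deleter both frees the value and, by its identity, encodes the role.
  Cache::DeleterFn deleter =
      GetCacheEntryDeleterForRole<MyBlockContents,
                                  CacheEntryRole::kOtherBlock>();
  cache->Insert(key, value, charge, deleter).PermitUncheckedError();
}

CacheEntryRole RoleOf(Cache::DeleterFn deleter) {
  // Copy the registered mappings once, then look up by function pointer.
  auto role_map = CopyCacheDeleterRoleMap();
  auto it = role_map.find(deleter);
  return it != role_map.end() ? it->second : CacheEntryRole::kMisc;
}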
+
+namespace cache_entry_roles_detail {
+
+template <typename T, CacheEntryRole R>
+struct RegisteredDeleter {
+  RegisteredDeleter() { RegisterCacheDeleterRole(Delete, R); }
+
+  // These have global linkage to help ensure compiler optimizations do not
+  // break uniqueness for each <T, R>
+  static void Delete(const Slice& /* key */, void* value) {
+    delete static_cast<T*>(value);
+  }
+};
+
+template <CacheEntryRole R>
+struct RegisteredNoopDeleter {
+  RegisteredNoopDeleter() { RegisterCacheDeleterRole(Delete, R); }
+
+  static void Delete(const Slice& /* key */, void* value) {
+    (void)value;
+    assert(value == nullptr);
+  }
+};
+
+} // namespace cache_entry_roles_detail
+
+// Get an automatically registered deleter for value type T and role R.
+// Based on C++ semantics, registration is invoked exactly once in a
+// thread-safe way on first call to this function, for each <T, R>.
+template <typename T, CacheEntryRole R>
+Cache::DeleterFn GetCacheEntryDeleterForRole() {
+  static cache_entry_roles_detail::RegisteredDeleter<T, R> reg;
+  return reg.Delete;
+}
+
+// Get an automatically registered no-op deleter (value should be nullptr)
+// and associated with role R. This is used for Cache "reservation" entries
+// such as for WriteBufferManager.
+template <CacheEntryRole R>
+Cache::DeleterFn GetNoopDeleterForRole() {
+  static cache_entry_roles_detail::RegisteredNoopDeleter<R> reg;
+  return reg.Delete;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/cache/cache_entry_stats.h b/cache/cache_entry_stats.h
new file mode 100644
index 00000000000..ecd387f8523
--- /dev/null
+++ b/cache/cache_entry_stats.h
@@ -0,0 +1,165 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include "cache/cache_helpers.h"
+#include "port/lang.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/status.h"
+#include "rocksdb/system_clock.h"
+#include "util/coding_lean.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A generic helper object for gathering stats about cache entries by
+// iterating over them with ApplyToAllEntries. This class essentially
+// solves the problem of slowing down a Cache with too many stats
+// collectors that could be sharing stat results, such as from multiple
+// column families or multiple DBs sharing a Cache. We employ a few
+// mitigations:
+// * Only one collector for a particular kind of Stats is alive
+//   for each Cache. This is guaranteed using the Cache itself to hold
+//   the collector.
+// * A mutex ensures only one thread is gathering stats for this
+//   collector.
+// * The most recent gathered stats are saved and simply copied to
+//   satisfy requests within a time window (default: 3 minutes) of
+//   completion of the most recent stat gathering.
+//
+// Template parameter Stats must be copyable and trivially constructable,
+// as well as...
+// concept Stats {
+//   // Notification before applying callback to all entries
+//   void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros);
+//   // Get the callback to apply to all entries. `callback`
+//   // type must be compatible with Cache::ApplyToAllEntries
+//   callback GetEntryCallback();
+//   // Notification after applying callback to all entries
+//   void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros);
+//   // Notification that a collection was skipped because of
+//   // sufficiently recent saved results.
+// void SkippedCollection(); +// } +template +class CacheEntryStatsCollector { + public: + // Gathers stats and saves results into `stats` + // + // Maximum allowed age for a "hit" on saved results is determined by the + // two interval parameters. Both set to 0 forces a re-scan. For example + // with min_interval_seconds=300 and min_interval_factor=100, if the last + // scan took 10s, we would only rescan ("miss") if the age in seconds of + // the saved results is > max(300, 100*10). + // Justification: scans can vary wildly in duration, e.g. from 0.02 sec + // to as much as 20 seconds, so we want to be able to cap the absolute + // and relative frequency of scans. + void GetStats(Stats *stats, int min_interval_seconds, + int min_interval_factor) { + // Waits for any pending reader or writer (collector) + std::lock_guard lock(mutex_); + + uint64_t max_age_micros = + static_cast(std::max(min_interval_seconds, 0)) * 1000000U; + + if (last_end_time_micros_ > last_start_time_micros_ && + min_interval_factor > 0) { + max_age_micros = std::max( + max_age_micros, min_interval_factor * (last_end_time_micros_ - + last_start_time_micros_)); + } + + uint64_t start_time_micros = clock_->NowMicros(); + if ((start_time_micros - last_end_time_micros_) > max_age_micros) { + last_start_time_micros_ = start_time_micros; + saved_stats_.BeginCollection(cache_, clock_, start_time_micros); + + cache_->ApplyToAllEntries(saved_stats_.GetEntryCallback(), {}); + TEST_SYNC_POINT_CALLBACK( + "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", nullptr); + + uint64_t end_time_micros = clock_->NowMicros(); + last_end_time_micros_ = end_time_micros; + saved_stats_.EndCollection(cache_, clock_, end_time_micros); + } else { + saved_stats_.SkippedCollection(); + } + // Copy to caller + *stats = saved_stats_; + } + + Cache *GetCache() const { return cache_; } + + // Gets or creates a shared instance of CacheEntryStatsCollector in the + // cache itself, and saves into `ptr`. This shared_ptr will hold the + // entry in cache until all refs are destroyed. + static Status GetShared(Cache *cache, SystemClock *clock, + std::shared_ptr *ptr) { + std::array cache_key_data{ + {// First 16 bytes == md5 of class name + 0x7eba5a8fb5437c90U, 0x8ca68c9b11655855U, + // Last 8 bytes based on a function pointer to make unique for each + // template instantiation + reinterpret_cast(&CacheEntryStatsCollector::GetShared)}}; + Slice cache_key = GetSlice(&cache_key_data); + + Cache::Handle *h = cache->Lookup(cache_key); + if (h == nullptr) { + // Not yet in cache, but Cache doesn't provide a built-in way to + // avoid racing insert. So we double-check under a shared mutex, + // inspired by TableCache. + STATIC_AVOID_DESTRUCTION(std::mutex, static_mutex); + std::lock_guard lock(static_mutex); + + h = cache->Lookup(cache_key); + if (h == nullptr) { + auto new_ptr = new CacheEntryStatsCollector(cache, clock); + // TODO: non-zero charge causes some tests that count block cache + // usage to go flaky. Fix the problem somehow so we can use an + // accurate charge. + size_t charge = 0; + Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h, + Cache::Priority::HIGH); + if (!s.ok()) { + assert(h == nullptr); + return s; + } + } + } + // If we reach here, shared entry is in cache with handle `h`. + assert(cache->GetDeleter(h) == Deleter); + + // Build an aliasing shared_ptr that keeps `ptr` in cache while there + // are references. 
+ *ptr = MakeSharedCacheHandleGuard(cache, h); + return Status::OK(); + } + + private: + explicit CacheEntryStatsCollector(Cache *cache, SystemClock *clock) + : saved_stats_(), + last_start_time_micros_(0), + last_end_time_micros_(/*pessimistic*/ 10000000), + cache_(cache), + clock_(clock) {} + + static void Deleter(const Slice &, void *value) { + delete static_cast(value); + } + + std::mutex mutex_; + Stats saved_stats_; + uint64_t last_start_time_micros_; + uint64_t last_end_time_micros_; + Cache *const cache_; + SystemClock *const clock_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_helpers.h b/cache/cache_helpers.h new file mode 100644 index 00000000000..4b784939613 --- /dev/null +++ b/cache/cache_helpers.h @@ -0,0 +1,125 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/cache.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Returns the cached value given a cache handle. +template +T* GetFromCacheHandle(Cache* cache, Cache::Handle* handle) { + assert(cache); + assert(handle); + + return static_cast(cache->Value(handle)); +} + +// Simple generic deleter for Cache (to be used with Cache::Insert). +template +void DeleteCacheEntry(const Slice& /* key */, void* value) { + delete static_cast(value); +} + +// Turns a T* into a Slice so it can be used as a key with Cache. +template +Slice GetSlice(const T* t) { + return Slice(reinterpret_cast(t), sizeof(T)); +} + +// Generic resource management object for cache handles that releases the handle +// when destroyed. Has unique ownership of the handle, so copying it is not +// allowed, while moving it transfers ownership. 
+template <typename T>
+class CacheHandleGuard {
+ public:
+  CacheHandleGuard() = default;
+
+  CacheHandleGuard(Cache* cache, Cache::Handle* handle)
+      : cache_(cache),
+        handle_(handle),
+        value_(GetFromCacheHandle<T>(cache, handle)) {
+    assert(cache_ && handle_ && value_);
+  }
+
+  CacheHandleGuard(const CacheHandleGuard&) = delete;
+  CacheHandleGuard& operator=(const CacheHandleGuard&) = delete;
+
+  CacheHandleGuard(CacheHandleGuard&& rhs) noexcept
+      : cache_(rhs.cache_), handle_(rhs.handle_), value_(rhs.value_) {
+    assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_));
+
+    rhs.ResetFields();
+  }
+
+  CacheHandleGuard& operator=(CacheHandleGuard&& rhs) noexcept {
+    if (this == &rhs) {
+      return *this;
+    }
+
+    ReleaseHandle();
+
+    cache_ = rhs.cache_;
+    handle_ = rhs.handle_;
+    value_ = rhs.value_;
+
+    assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_));
+
+    rhs.ResetFields();
+
+    return *this;
+  }
+
+  ~CacheHandleGuard() { ReleaseHandle(); }
+
+  bool IsEmpty() const { return !handle_; }
+
+  Cache* GetCache() const { return cache_; }
+  Cache::Handle* GetCacheHandle() const { return handle_; }
+  T* GetValue() const { return value_; }
+
+  void Reset() {
+    ReleaseHandle();
+    ResetFields();
+  }
+
+ private:
+  void ReleaseHandle() {
+    if (IsEmpty()) {
+      return;
+    }
+
+    assert(cache_);
+    cache_->Release(handle_);
+  }
+
+  void ResetFields() {
+    cache_ = nullptr;
+    handle_ = nullptr;
+    value_ = nullptr;
+  }
+
+ private:
+  Cache* cache_ = nullptr;
+  Cache::Handle* handle_ = nullptr;
+  T* value_ = nullptr;
+};
+
+// Build an aliasing shared_ptr that keeps `handle` in cache while there
+// are references, but the pointer is to the value for that cache entry,
+// which must be of type T. This is copyable, unlike CacheHandleGuard, but
+// does not provide access to caching details.
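As an editorial aside between the guard class above and the aliasing helper defined next, here is a minimal, hypothetical usage sketch of CacheHandleGuard (not part of the patch); `BlockLike` and `ReadPayloadIfCached` are assumed names.

#include "cache/cache_helpers.h"
#include "rocksdb/cache.h"

struct BlockLike {  // hypothetical cached value type
  int payload = 0;
};

// Returns the cached payload for `key`, or -1 if not present. The guard
// releases the cache handle when it goes out of scope, even on early return.
int ReadPayloadIfCached(ROCKSDB_NAMESPACE::Cache* cache,
                        const ROCKSDB_NAMESPACE::Slice& key) {
  ROCKSDB_NAMESPACE::Cache::Handle* handle = cache->Lookup(key);
  if (handle == nullptr) {
    return -1;
  }
  ROCKSDB_NAMESPACE::CacheHandleGuard<BlockLike> guard(cache, handle);
  return guard.GetValue()->payload;
}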
+template +std::shared_ptr MakeSharedCacheHandleGuard(Cache* cache, + Cache::Handle* handle) { + auto wrapper = std::make_shared>(cache, handle); + return std::shared_ptr(wrapper, static_cast(cache->Value(handle))); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 40d8c42cc7c..66881b5c998 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -712,25 +712,98 @@ TEST_P(CacheTest, OverCapacity) { } namespace { -std::vector> callback_state; -void callback(void* entry, size_t charge) { - callback_state.push_back({DecodeValue(entry), static_cast(charge)}); +std::vector> legacy_callback_state; +void legacy_callback(void* value, size_t charge) { + legacy_callback_state.push_back( + {DecodeValue(value), static_cast(charge)}); } }; -TEST_P(CacheTest, ApplyToAllCacheEntiresTest) { +TEST_P(CacheTest, ApplyToAllCacheEntriesTest) { std::vector> inserted; - callback_state.clear(); + legacy_callback_state.clear(); for (int i = 0; i < 10; ++i) { Insert(i, i * 2, i + 1); inserted.push_back({i * 2, i + 1}); } - cache_->ApplyToAllCacheEntries(callback, true); + cache_->ApplyToAllCacheEntries(legacy_callback, true); + + std::sort(inserted.begin(), inserted.end()); + std::sort(legacy_callback_state.begin(), legacy_callback_state.end()); + ASSERT_EQ(inserted.size(), legacy_callback_state.size()); + for (size_t i = 0; i < inserted.size(); ++i) { + EXPECT_EQ(inserted[i], legacy_callback_state[i]); + } +} + +TEST_P(CacheTest, ApplyToAllEntriesTest) { + std::vector callback_state; + const auto callback = [&](const Slice& key, void* value, size_t charge, + Cache::DeleterFn deleter) { + callback_state.push_back(ToString(DecodeKey(key)) + "," + + ToString(DecodeValue(value)) + "," + + ToString(charge)); + assert(deleter == &CacheTest::Deleter); + }; + + std::vector inserted; + callback_state.clear(); + + for (int i = 0; i < 10; ++i) { + Insert(i, i * 2, i + 1); + inserted.push_back(ToString(i) + "," + ToString(i * 2) + "," + + ToString(i + 1)); + } + cache_->ApplyToAllEntries(callback, /*opts*/ {}); std::sort(inserted.begin(), inserted.end()); std::sort(callback_state.begin(), callback_state.end()); - ASSERT_TRUE(inserted == callback_state); + ASSERT_EQ(inserted.size(), callback_state.size()); + for (size_t i = 0; i < inserted.size(); ++i) { + EXPECT_EQ(inserted[i], callback_state[i]); + } +} + +TEST_P(CacheTest, ApplyToAllEntriesDuringResize) { + // This is a mini-stress test of ApplyToAllEntries, to ensure + // items in the cache that are neither added nor removed + // during ApplyToAllEntries are counted exactly once. + + // Insert some entries that we expect to be seen exactly once + // during iteration. 
+ constexpr int kSpecialCharge = 2; + constexpr int kNotSpecialCharge = 1; + constexpr int kSpecialCount = 100; + for (int i = 0; i < kSpecialCount; ++i) { + Insert(i, i * 2, kSpecialCharge); + } + + // For callback + int special_count = 0; + const auto callback = [&](const Slice&, void*, size_t charge, + Cache::DeleterFn) { + if (charge == static_cast(kSpecialCharge)) { + ++special_count; + } + }; + + // Start counting + std::thread apply_thread([&]() { + // Use small average_entries_per_lock to make the problem difficult + Cache::ApplyToAllEntriesOptions opts; + opts.average_entries_per_lock = 2; + cache_->ApplyToAllEntries(callback, opts); + }); + + // In parallel, add more entries, enough to cause resize but not enough + // to cause ejections + for (int i = kSpecialCount * 1; i < kSpecialCount * 6; ++i) { + Insert(i, i * 2, kNotSpecialCharge); + } + + apply_thread.join(); + ASSERT_EQ(special_count, kSpecialCount); } TEST_P(CacheTest, DefaultShardBits) { @@ -749,11 +822,12 @@ TEST_P(CacheTest, DefaultShardBits) { ASSERT_EQ(6, sc->GetNumShardBits()); } -TEST_P(CacheTest, GetCharge) { +TEST_P(CacheTest, GetChargeAndDeleter) { Insert(1, 2); Cache::Handle* h1 = cache_->Lookup(EncodeKey(1)); ASSERT_EQ(2, DecodeValue(cache_->Value(h1))); ASSERT_EQ(1, cache_->GetCharge(h1)); + ASSERT_EQ(&CacheTest::Deleter, cache_->GetDeleter(h1)); cache_->Release(h1); } diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 7934b378bdb..a3cb7d2c3f9 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -176,10 +176,13 @@ namespace { // Cache entry meta data. struct CacheHandle { Slice key; - uint32_t hash; void* value; size_t charge; - void (*deleter)(const Slice&, void* value); + Cache::DeleterFn deleter; + uint32_t hash; + + // Addition to "charge" to get "total charge" under metadata policy. + uint32_t meta_charge; // Flags and counters associated with the cache handle: // lowest bit: in-cache bit @@ -205,9 +208,8 @@ struct CacheHandle { return *this; } - inline static size_t CalcTotalCharge( - Slice key, size_t charge, - CacheMetadataChargePolicy metadata_charge_policy) { + inline static uint32_t CalcMetadataCharge( + Slice key, CacheMetadataChargePolicy metadata_charge_policy) { size_t meta_charge = 0; if (metadata_charge_policy == kFullChargeCacheMetadata) { meta_charge += sizeof(CacheHandle); @@ -218,13 +220,11 @@ struct CacheHandle { meta_charge += key.size(); #endif } - return charge + meta_charge; + assert(meta_charge <= UINT32_MAX); + return static_cast(meta_charge); } - inline size_t CalcTotalCharge( - CacheMetadataChargePolicy metadata_charge_policy) { - return CalcTotalCharge(key, charge, metadata_charge_policy); - } + inline size_t GetTotalCharge() { return charge + meta_charge; } }; // Key of hash map. We store hash value with the key for convenience. 
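The ApplyToAllEntries API exercised by the tests and cache_bench changes above replaces ApplyToAllCacheEntries: the callback now receives the key, value, charge, and deleter, and the scan visits a bounded number of entries per shard lock. A minimal, hypothetical caller-side sketch (editorial, not part of the patch); `DumpCacheTotals` is an assumed name.

#include <cstdio>

#include "rocksdb/cache.h"

void DumpCacheTotals(ROCKSDB_NAMESPACE::Cache* cache) {
  size_t total_charge = 0;
  size_t entry_count = 0;
  auto callback = [&](const ROCKSDB_NAMESPACE::Slice& /*key*/,
                      void* /*value*/, size_t charge,
                      ROCKSDB_NAMESPACE::Cache::DeleterFn /*deleter*/) {
    total_charge += charge;
    ++entry_count;
  };
  ROCKSDB_NAMESPACE::Cache::ApplyToAllEntriesOptions opts;
  // Limit how many entries are visited per shard lock so that concurrent
  // readers and writers are not stalled (same knob as cache_bench above).
  opts.average_entries_per_lock = 256;
  cache->ApplyToAllEntries(callback, opts);
  std::fprintf(stdout, "entries=%zu, total charge=%zu\n", entry_count,
               total_charge);
}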
@@ -271,7 +271,25 @@ class ClockCacheShard final : public CacheShard { Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), Cache::Handle** handle, Cache::Priority priority) override; + Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + Cache::Handle** handle, Cache::Priority priority) override { + return Insert(key, hash, value, charge, helper->del_cb, handle, priority); + } Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; + Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* /*helper*/, + const Cache::CreateCallback& /*create_cb*/, + Cache::Priority /*priority*/, bool /*wait*/) override { + return Lookup(key, hash); + } + bool Release(Cache::Handle* handle, bool /*useful*/, + bool force_erase) override { + return Release(handle, force_erase); + } + bool IsReady(Cache::Handle* /*handle*/) override { return true; } + void Wait(Cache::Handle* /*handle*/) override {} + // If the entry in in cache, increase reference count and return true. // Return false otherwise. // @@ -284,8 +302,10 @@ class ClockCacheShard final : public CacheShard { size_t GetUsage() const override; size_t GetPinnedUsage() const override; void EraseUnRefEntries() override; - void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) override; private: static const uint32_t kInCacheBit = 1; @@ -404,22 +424,46 @@ size_t ClockCacheShard::GetPinnedUsage() const { return pinned_usage_.load(std::memory_order_relaxed); } -void ClockCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) { - if (thread_safe) { - mutex_.Lock(); +void ClockCacheShard::ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) { + assert(average_entries_per_lock > 0); + MutexLock lock(&mutex_); + + // Figure out the range to iterate, update `state` + size_t list_size = list_.size(); + size_t start_idx = *state; + size_t end_idx = start_idx + average_entries_per_lock; + if (start_idx > list_size) { + // Shouldn't reach here, but recoverable + assert(false); + // Mark finished with all + *state = UINT32_MAX; + return; + } + if (end_idx >= list_size || end_idx >= UINT32_MAX) { + // This also includes the hypothetical case of >4 billion + // cache handles. + end_idx = list_size; + // Mark finished with all + *state = UINT32_MAX; + } else { + *state = static_cast(end_idx); } - for (auto& handle : list_) { - // Use relaxed semantics instead of acquire semantics since we are either - // holding mutex, or don't have thread safe requirement. 
+ + // Do the iteration + auto cur = list_.begin() + start_idx; + auto end = list_.begin() + end_idx; + for (; cur != end; ++cur) { + const CacheHandle& handle = *cur; + // Use relaxed semantics instead of acquire semantics since we are + // holding mutex uint32_t flags = handle.flags.load(std::memory_order_relaxed); if (InCache(flags)) { - callback(handle.value, handle.charge); + callback(handle.key, handle.value, handle.charge, handle.deleter); } } - if (thread_safe) { - mutex_.Unlock(); - } } void ClockCacheShard::RecycleHandle(CacheHandle* handle, @@ -428,10 +472,8 @@ void ClockCacheShard::RecycleHandle(CacheHandle* handle, assert(!InCache(handle->flags) && CountRefs(handle->flags) == 0); context->to_delete_key.push_back(handle->key.data()); context->to_delete_value.emplace_back(*handle); - size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); - handle->key.clear(); - handle->value = nullptr; - handle->deleter = nullptr; + size_t total_charge = handle->GetTotalCharge(); + // clearing `handle` fields would go here but not strictly required recycle_.push_back(handle); usage_.fetch_sub(total_charge, std::memory_order_relaxed); } @@ -459,7 +501,7 @@ bool ClockCacheShard::Ref(Cache::Handle* h) { std::memory_order_relaxed)) { if (CountRefs(flags) == 0) { // No reference count before the operation. - size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); + size_t total_charge = handle->GetTotalCharge(); pinned_usage_.fetch_add(total_charge, std::memory_order_relaxed); } return true; @@ -473,6 +515,11 @@ bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage, if (set_usage) { handle->flags.fetch_or(kUsageBit, std::memory_order_relaxed); } + // If the handle reaches state refs=0 and InCache=true after this + // atomic operation then we cannot access `handle` afterward, because + // it could be evicted before we access the `handle`. + size_t total_charge = handle->GetTotalCharge(); + // Use acquire-release semantics as previous operations on the cache entry // has to be order before reference count is decreased, and potential cleanup // of the entry has to be order after. @@ -480,7 +527,6 @@ bool ClockCacheShard::Unref(CacheHandle* handle, bool set_usage, assert(CountRefs(flags) > 0); if (CountRefs(flags) == 1) { // this is the last reference. - size_t total_charge = handle->CalcTotalCharge(metadata_charge_policy_); pinned_usage_.fetch_sub(total_charge, std::memory_order_relaxed); // Cleanup if it is the last reference. if (!InCache(flags)) { @@ -567,8 +613,9 @@ CacheHandle* ClockCacheShard::Insert( void (*deleter)(const Slice& key, void* value), bool hold_reference, CleanupContext* context, bool* overwritten) { assert(overwritten != nullptr && *overwritten == false); - size_t total_charge = - CacheHandle::CalcTotalCharge(key, charge, metadata_charge_policy_); + uint32_t meta_charge = + CacheHandle::CalcMetadataCharge(key, metadata_charge_policy_); + size_t total_charge = charge + meta_charge; MutexLock l(&mutex_); bool success = EvictFromCache(total_charge, context); bool strict = strict_capacity_limit_.load(std::memory_order_relaxed); @@ -594,8 +641,18 @@ CacheHandle* ClockCacheShard::Insert( handle->hash = hash; handle->value = value; handle->charge = charge; + handle->meta_charge = meta_charge; handle->deleter = deleter; uint32_t flags = hold_reference ? 
kInCacheBit + kOneRef : kInCacheBit; + + // TODO investigate+fix suspected race condition: + // [thread 1] Lookup starts, up to Ref() + // [thread 2] Erase/evict the entry just looked up + // [thread 1] Ref() the handle, even though it's in the recycle bin + // [thread 2] Insert with recycling that handle + // Here we obliterate the other thread's Ref + // Possible fix: never blindly overwrite the flags, but only make + // relative updates (fetch_add, etc). handle->flags.store(flags, std::memory_order_relaxed); HashTable::accessor accessor; if (table_.find(accessor, CacheKey(key, hash))) { @@ -726,11 +783,11 @@ class ClockCache final : public ShardedCache { const char* Name() const override { return "ClockCache"; } - CacheShard* GetShard(int shard) override { + CacheShard* GetShard(uint32_t shard) override { return reinterpret_cast(&shards_[shard]); } - const CacheShard* GetShard(int shard) const override { + const CacheShard* GetShard(uint32_t shard) const override { return reinterpret_cast(&shards_[shard]); } @@ -746,7 +803,17 @@ class ClockCache final : public ShardedCache { return reinterpret_cast(handle)->hash; } - void DisownData() override { shards_ = nullptr; } + DeleterFn GetDeleter(Handle* handle) const override { + return reinterpret_cast(handle)->deleter; + } + + void DisownData() override { +#ifndef MUST_FREE_HEAP_ALLOCATIONS + shards_ = nullptr; +#endif + } + + void WaitAll(std::vector& /*handles*/) override {} private: ClockCacheShard* shards_; diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index 04e612bdbcc..f7da46b69f8 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -9,26 +9,28 @@ #include "cache/lru_cache.h" -#include -#include -#include -#include +#include +#include +#include #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -LRUHandleTable::LRUHandleTable() : list_(nullptr), length_(0), elems_(0) { - Resize(); -} +LRUHandleTable::LRUHandleTable(int max_upper_hash_bits) + : length_bits_(/* historical starting size*/ 4), + list_(new LRUHandle* [size_t{1} << length_bits_] {}), + elems_(0), + max_length_bits_(max_upper_hash_bits) {} LRUHandleTable::~LRUHandleTable() { - ApplyToAllCacheEntries([](LRUHandle* h) { - if (!h->HasRefs()) { - h->Free(); - } - }); - delete[] list_; + ApplyToEntriesRange( + [](LRUHandle* h) { + if (!h->HasRefs()) { + h->Free(); + } + }, + 0, uint32_t{1} << length_bits_); } LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) { @@ -42,7 +44,7 @@ LRUHandle* LRUHandleTable::Insert(LRUHandle* h) { *ptr = h; if (old == nullptr) { ++elems_; - if (elems_ > length_) { + if ((elems_ >> length_bits_) > 0) { // elems_ >= length // Since each cache entry is fairly large, we aim for a small // average linked list length (<= 1). Resize(); @@ -62,7 +64,7 @@ LRUHandle* LRUHandleTable::Remove(const Slice& key, uint32_t hash) { } LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) { - LRUHandle** ptr = &list_[hash & (length_ - 1)]; + LRUHandle** ptr = &list_[hash >> (32 - length_bits_)]; while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) { ptr = &(*ptr)->next_hash; } @@ -70,19 +72,29 @@ LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) { } void LRUHandleTable::Resize() { - uint32_t new_length = 16; - while (new_length < elems_ * 1.5) { - new_length *= 2; + if (length_bits_ >= max_length_bits_) { + // Due to reaching limit of hash information, if we made the table + // bigger, we would allocate more addresses but only the same + // number would be used. 
+ return; } - LRUHandle** new_list = new LRUHandle*[new_length]; - memset(new_list, 0, sizeof(new_list[0]) * new_length); + if (length_bits_ >= 31) { + // Avoid undefined behavior shifting uint32_t by 32 + return; + } + + uint32_t old_length = uint32_t{1} << length_bits_; + int new_length_bits = length_bits_ + 1; + std::unique_ptr new_list { + new LRUHandle* [size_t{1} << new_length_bits] {} + }; uint32_t count = 0; - for (uint32_t i = 0; i < length_; i++) { + for (uint32_t i = 0; i < old_length; i++) { LRUHandle* h = list_[i]; while (h != nullptr) { LRUHandle* next = h->next_hash; uint32_t hash = h->hash; - LRUHandle** ptr = &new_list[hash & (new_length - 1)]; + LRUHandle** ptr = &new_list[hash >> (32 - new_length_bits)]; h->next_hash = *ptr; *ptr = h; h = next; @@ -90,23 +102,25 @@ void LRUHandleTable::Resize() { } } assert(elems_ == count); - delete[] list_; - list_ = new_list; - length_ = new_length; + list_ = std::move(new_list); + length_bits_ = new_length_bits; } -LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, - double high_pri_pool_ratio, - bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) +LRUCacheShard::LRUCacheShard( + size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, + bool use_adaptive_mutex, CacheMetadataChargePolicy metadata_charge_policy, + int max_upper_hash_bits, + const std::shared_ptr& secondary_cache) : capacity_(0), high_pri_pool_usage_(0), strict_capacity_limit_(strict_capacity_limit), high_pri_pool_ratio_(high_pri_pool_ratio), high_pri_pool_capacity_(0), + table_(max_upper_hash_bits), usage_(0), lru_usage_(0), - mutex_(use_adaptive_mutex) { + mutex_(use_adaptive_mutex), + secondary_cache_(secondary_cache) { set_metadata_charge_policy(metadata_charge_policy); // Make empty circular linked list lru_.next = &lru_; @@ -138,19 +152,40 @@ void LRUCacheShard::EraseUnRefEntries() { } } -void LRUCacheShard::ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) { - const auto applyCallback = [&]() { - table_.ApplyToAllCacheEntries( - [callback](LRUHandle* h) { callback(h->value, h->charge); }); - }; - - if (thread_safe) { - MutexLock l(&mutex_); - applyCallback(); +void LRUCacheShard::ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) { + // The state is essentially going to be the starting hash, which works + // nicely even if we resize between calls because we use upper-most + // hash bits for table indexes. + MutexLock l(&mutex_); + uint32_t length_bits = table_.GetLengthBits(); + uint32_t length = uint32_t{1} << length_bits; + + assert(average_entries_per_lock > 0); + // Assuming we are called with same average_entries_per_lock repeatedly, + // this simplifies some logic (index_end will not overflow) + assert(average_entries_per_lock < length || *state == 0); + + uint32_t index_begin = *state >> (32 - length_bits); + uint32_t index_end = index_begin + average_entries_per_lock; + if (index_end >= length) { + // Going to end + index_end = length; + *state = UINT32_MAX; } else { - applyCallback(); + *state = index_end << (32 - length_bits); } + + table_.ApplyToEntriesRange( + [callback](LRUHandle* h) { + DeleterFn deleter = h->IsSecondaryCacheCompatible() + ? 
h->info_.helper->del_cb + : h->info_.deleter; + callback(h->key(), h->value, h->charge, deleter); + }, + index_begin, index_end); } void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri) { @@ -257,8 +292,14 @@ void LRUCacheShard::SetCapacity(size_t capacity) { EvictFromLRU(0, &last_reference_list); } + // Try to insert the evicted entries into tiered cache // Free the entries outside of mutex for performance reasons for (auto entry : last_reference_list) { + if (secondary_cache_ && entry->IsSecondaryCacheCompatible() && + !entry->IsPromoted()) { + secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper) + .PermitUncheckedError(); + } entry->Free(); } } @@ -268,17 +309,176 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { strict_capacity_limit_ = strict_capacity_limit; } -Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { - MutexLock l(&mutex_); - LRUHandle* e = table_.Lookup(key, hash); - if (e != nullptr) { - assert(e->InCache()); - if (!e->HasRefs()) { - // The entry is in LRU since it's in hash and has no external references - LRU_Remove(e); +Status LRUCacheShard::InsertItem(LRUHandle* e, Cache::Handle** handle, + bool free_handle_on_fail) { + Status s = Status::OK(); + autovector last_reference_list; + size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); + + { + MutexLock l(&mutex_); + + // Free the space following strict LRU policy until enough space + // is freed or the lru list is empty + EvictFromLRU(total_charge, &last_reference_list); + + if ((usage_ + total_charge) > capacity_ && + (strict_capacity_limit_ || handle == nullptr)) { + e->SetInCache(false); + if (handle == nullptr) { + // Don't insert the entry but still return ok, as if the entry inserted + // into cache and get evicted immediately. + last_reference_list.push_back(e); + } else { + if (free_handle_on_fail) { + delete[] reinterpret_cast(e); + *handle = nullptr; + } + s = Status::Incomplete("Insert failed due to LRU cache being full."); + } + } else { + // Insert into the cache. Note that the cache might get larger than its + // capacity if not enough space was freed up. 
+ LRUHandle* old = table_.Insert(e); + usage_ += total_charge; + if (old != nullptr) { + s = Status::OkOverwritten(); + assert(old->InCache()); + old->SetInCache(false); + if (!old->HasRefs()) { + // old is on LRU because it's in cache and its reference count is 0 + LRU_Remove(old); + size_t old_total_charge = + old->CalcTotalCharge(metadata_charge_policy_); + assert(usage_ >= old_total_charge); + usage_ -= old_total_charge; + last_reference_list.push_back(old); + } + } + if (handle == nullptr) { + LRU_Insert(e); + } else { + e->Ref(); + *handle = reinterpret_cast(e); + } + } + } + + // Try to insert the evicted entries into the secondary cache + // Free the entries here outside of mutex for performance reasons + for (auto entry : last_reference_list) { + if (secondary_cache_ && entry->IsSecondaryCacheCompatible() && + !entry->IsPromoted()) { + secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper) + .PermitUncheckedError(); + } + entry->Free(); + } + + return s; +} + +void LRUCacheShard::Promote(LRUHandle* e) { + SecondaryCacheResultHandle* secondary_handle = e->sec_handle; + + assert(secondary_handle->IsReady()); + e->SetIncomplete(false); + e->SetInCache(true); + e->SetPromoted(true); + e->value = secondary_handle->Value(); + e->charge = secondary_handle->Size(); + delete secondary_handle; + + // This call could fail if the cache is over capacity and + // strict_capacity_limit_ is true. In such a case, we don't want + // InsertItem() to free the handle, since the item is already in memory + // and the caller will most likely just read from disk if we erase it here. + if (e->value) { + Cache::Handle* handle = reinterpret_cast(e); + Status s = InsertItem(e, &handle, /*free_handle_on_fail=*/false); + if (s.ok()) { + // InsertItem would have taken a reference on the item, so decrement it + // here as we expect the caller to already hold a reference + e->Unref(); + } else { + // Item is in memory, but not accounted against the cache capacity. + // When the handle is released, the item should get deleted + assert(!e->InCache()); + } + } else { + // Since the secondary cache lookup failed, mark the item as not in cache + // and charge the cache only for metadata usage, i.e handle, key etc + MutexLock l(&mutex_); + e->charge = 0; + e->SetInCache(false); + usage_ += e->CalcTotalCharge(metadata_charge_policy_); + } +} + +Cache::Handle* LRUCacheShard::Lookup( + const Slice& key, uint32_t hash, + const ShardedCache::CacheItemHelper* helper, + const ShardedCache::CreateCallback& create_cb, Cache::Priority priority, + bool wait) { + LRUHandle* e = nullptr; + { + MutexLock l(&mutex_); + e = table_.Lookup(key, hash); + if (e != nullptr) { + assert(e->InCache()); + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external references + LRU_Remove(e); + } + e->Ref(); + e->SetHit(); + } + } + + // If handle table lookup failed, then allocate a handle outside the + // mutex if we're going to lookup in the secondary cache + // Only support synchronous for now + // TODO: Support asynchronous lookup in secondary cache + if (!e && secondary_cache_ && helper && helper->saveto_cb) { + // For objects from the secondary cache, we expect the caller to provide + // a way to create/delete the primary cache object. The only case where + // a deleter would not be required is for dummy entries inserted for + // accounting purposes, which we won't demote to the secondary cache + // anyway. 
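
Editor's note: the refactored InsertItem() above keeps the caller-visible contract of Insert(): when an entry cannot fit, a call that passes a handle under a strict capacity limit gets Status::Incomplete(), while a call with a null handle still returns OK and the entry is dropped as if it had been evicted immediately. A sketch of that behavior against the public API, assuming the semantics shown in the patch (capacity and charge values are arbitrary):

#include <cassert>
#include <memory>
#include "rocksdb/cache.h"

using ROCKSDB_NAMESPACE::Cache;
using ROCKSDB_NAMESPACE::LRUCacheOptions;
using ROCKSDB_NAMESPACE::NewLRUCache;
using ROCKSDB_NAMESPACE::Slice;
using ROCKSDB_NAMESPACE::Status;

int main() {
  LRUCacheOptions opts;
  opts.capacity = 1024;
  opts.strict_capacity_limit = true;
  std::shared_ptr<Cache> cache = NewLRUCache(opts);

  static int value = 0;
  auto noop_deleter = [](const Slice& /*key*/, void* /*value*/) {};

  // Larger than the whole cache and requested with a handle: rejected.
  Cache::Handle* handle = nullptr;
  Status s = cache->Insert("k1", &value, /*charge=*/4096, noop_deleter, &handle);
  assert(s.IsIncomplete() && handle == nullptr);

  // Same charge but no handle requested: accepted, then dropped right away.
  s = cache->Insert("k2", &value, /*charge=*/4096, noop_deleter, nullptr);
  assert(s.ok());
  return 0;
}
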
+ assert(create_cb && helper->del_cb); + std::unique_ptr secondary_handle = + secondary_cache_->Lookup(key, create_cb, wait); + if (secondary_handle != nullptr) { + e = reinterpret_cast( + new char[sizeof(LRUHandle) - 1 + key.size()]); + + e->flags = 0; + e->SetSecondaryCacheCompatible(true); + e->info_.helper = helper; + e->key_length = key.size(); + e->hash = hash; + e->refs = 0; + e->next = e->prev = nullptr; + e->SetPriority(priority); + memcpy(e->key_data, key.data(), key.size()); + e->value = nullptr; + e->sec_handle = secondary_handle.release(); + e->Ref(); + + if (wait) { + Promote(e); + if (!e->value) { + // The secondary cache returned a handle, but the lookup failed + e->Unref(); + e->Free(); + e = nullptr; + } + } else { + // If wait is false, we always return a handle and let the caller + // release the handle after checking for success or failure + e->SetIncomplete(true); + } } - e->Ref(); - e->SetHit(); } return reinterpret_cast(e); } @@ -339,81 +539,32 @@ bool LRUCacheShard::Release(Cache::Handle* handle, bool force_erase) { Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), + const Cache::CacheItemHelper* helper, Cache::Handle** handle, Cache::Priority priority) { // Allocate the memory here outside of the mutex // If the cache is full, we'll have to release it // It shouldn't happen very often though. LRUHandle* e = reinterpret_cast( new char[sizeof(LRUHandle) - 1 + key.size()]); - Status s = Status::OK(); - autovector last_reference_list; e->value = value; - e->deleter = deleter; + e->flags = 0; + if (helper) { + e->SetSecondaryCacheCompatible(true); + e->info_.helper = helper; + } else { + e->info_.deleter = deleter; + } e->charge = charge; e->key_length = key.size(); - e->flags = 0; e->hash = hash; e->refs = 0; e->next = e->prev = nullptr; e->SetInCache(true); e->SetPriority(priority); memcpy(e->key_data, key.data(), key.size()); - size_t total_charge = e->CalcTotalCharge(metadata_charge_policy_); - { - MutexLock l(&mutex_); - - // Free the space following strict LRU policy until enough space - // is freed or the lru list is empty - EvictFromLRU(total_charge, &last_reference_list); - - if ((usage_ + total_charge) > capacity_ && - (strict_capacity_limit_ || handle == nullptr)) { - if (handle == nullptr) { - // Don't insert the entry but still return ok, as if the entry inserted - // into cache and get evicted immediately. - e->SetInCache(false); - last_reference_list.push_back(e); - } else { - delete[] reinterpret_cast(e); - *handle = nullptr; - s = Status::Incomplete("Insert failed due to LRU cache being full."); - } - } else { - // Insert into the cache. Note that the cache might get larger than its - // capacity if not enough space was freed up. 
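
Editor's note: both the secondary-cache path in Lookup() and Insert() above allocate handles with new char[sizeof(LRUHandle) - 1 + key.size()] so the key bytes live inline after the struct. A self-contained sketch of that layout trick (the Handle type here is a stand-in, not the real LRUHandle):

#include <cstring>
#include <iostream>
#include <string>

// The struct ends in a one-byte array; the allocation is over-sized so the
// full key is stored contiguously with the handle, avoiding a second heap
// allocation per entry.
struct Handle {
  size_t key_length;
  char key_data[1];  // beginning of the key; the real key extends past it

  std::string key() const { return std::string(key_data, key_length); }
};

Handle* NewHandle(const std::string& key) {
  Handle* h = reinterpret_cast<Handle*>(new char[sizeof(Handle) - 1 + key.size()]);
  h->key_length = key.size();
  std::memcpy(h->key_data, key.data(), key.size());
  return h;
}

int main() {
  Handle* h = NewHandle("block#12345");
  std::cout << h->key() << "\n";             // block#12345
  delete[] reinterpret_cast<char*>(h);       // mirrors how LRUHandle memory is freed
  return 0;
}
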
- LRUHandle* old = table_.Insert(e); - usage_ += total_charge; - if (old != nullptr) { - s = Status::OkOverwritten(); - assert(old->InCache()); - old->SetInCache(false); - if (!old->HasRefs()) { - // old is on LRU because it's in cache and its reference count is 0 - LRU_Remove(old); - size_t old_total_charge = - old->CalcTotalCharge(metadata_charge_policy_); - assert(usage_ >= old_total_charge); - usage_ -= old_total_charge; - last_reference_list.push_back(old); - } - } - if (handle == nullptr) { - LRU_Insert(e); - } else { - e->Ref(); - *handle = reinterpret_cast(e); - } - } - } - - // Free the entries here outside of mutex for performance reasons - for (auto entry : last_reference_list) { - entry->Free(); - } - - return s; + return InsertItem(e, handle, /* free_handle_on_fail */ true); } void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { @@ -443,6 +594,18 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { } } +bool LRUCacheShard::IsReady(Cache::Handle* handle) { + LRUHandle* e = reinterpret_cast(handle); + MutexLock l(&mutex_); + bool ready = true; + if (e->IsPending()) { + assert(secondary_cache_); + assert(e->sec_handle); + ready = e->sec_handle->IsReady(); + } + return ready; +} + size_t LRUCacheShard::GetUsage() const { MutexLock l(&mutex_); return usage_; @@ -469,7 +632,8 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) + CacheMetadataChargePolicy metadata_charge_policy, + const std::shared_ptr& secondary_cache) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, std::move(allocator)) { num_shards_ = 1 << num_shard_bits; @@ -477,10 +641,12 @@ LRUCache::LRUCache(size_t capacity, int num_shard_bits, port::cacheline_aligned_alloc(sizeof(LRUCacheShard) * num_shards_)); size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_; for (int i = 0; i < num_shards_; i++) { - new (&shards_[i]) - LRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio, - use_adaptive_mutex, metadata_charge_policy); + new (&shards_[i]) LRUCacheShard( + per_shard, strict_capacity_limit, high_pri_pool_ratio, + use_adaptive_mutex, metadata_charge_policy, + /* max_upper_hash_bits */ 32 - num_shard_bits, secondary_cache); } + secondary_cache_ = secondary_cache; } LRUCache::~LRUCache() { @@ -493,11 +659,11 @@ LRUCache::~LRUCache() { } } -CacheShard* LRUCache::GetShard(int shard) { +CacheShard* LRUCache::GetShard(uint32_t shard) { return reinterpret_cast(&shards_[shard]); } -const CacheShard* LRUCache::GetShard(int shard) const { +const CacheShard* LRUCache::GetShard(uint32_t shard) const { return reinterpret_cast(&shards_[shard]); } @@ -509,23 +675,25 @@ size_t LRUCache::GetCharge(Handle* handle) const { return reinterpret_cast(handle)->charge; } +Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const { + auto h = reinterpret_cast(handle); + if (h->IsSecondaryCacheCompatible()) { + return h->info_.helper->del_cb; + } else { + return h->info_.deleter; + } +} + uint32_t LRUCache::GetHash(Handle* handle) const { return reinterpret_cast(handle)->hash; } void LRUCache::DisownData() { // Do not drop data if compile with ASAN to suppress leak warning. 
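
Editor's note: the LRUCache constructor above splits the total capacity across shards with a ceiling division, and (as the header comments later in the patch note) shard selection uses the low hash bits while each shard's table index uses the high bits. A small arithmetic sketch with made-up shard count and hash:

#include <cassert>
#include <cstdint>

int main() {
  const int num_shard_bits = 4;
  const uint32_t num_shards = uint32_t{1} << num_shard_bits;      // 16 shards
  const size_t capacity = 1000;

  // Ceiling division: per-shard capacities sum to at least the request.
  const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;
  assert(per_shard == 63 && per_shard * num_shards >= capacity);

  // Low bits pick the shard; the shard's table index comes from the high
  // bits, so the two never compete for the same bits of the hash.
  const uint32_t shard_mask = num_shards - 1;
  const uint32_t hash = 0xABCD1234;
  assert((hash & shard_mask) == 0x4);
  return 0;
}
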
-#if defined(__clang__) -#if !defined(__has_feature) || !__has_feature(address_sanitizer) +#ifndef MUST_FREE_HEAP_ALLOCATIONS shards_ = nullptr; num_shards_ = 0; #endif -#else // __clang__ -#ifndef __SANITIZE_ADDRESS__ - shards_ = nullptr; - num_shards_ = 0; -#endif // !__SANITIZE_ADDRESS__ -#endif // __clang__ } size_t LRUCache::TEST_GetLRUSize() { @@ -544,19 +712,42 @@ double LRUCache::GetHighPriPoolRatio() { return result; } -std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { - return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits, - cache_opts.strict_capacity_limit, - cache_opts.high_pri_pool_ratio, - cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, - cache_opts.metadata_charge_policy); +void LRUCache::WaitAll(std::vector& handles) { + if (secondary_cache_) { + std::vector sec_handles; + sec_handles.reserve(handles.size()); + for (Handle* handle : handles) { + if (!handle) { + continue; + } + LRUHandle* lru_handle = reinterpret_cast(handle); + if (!lru_handle->IsPending()) { + continue; + } + sec_handles.emplace_back(lru_handle->sec_handle); + } + secondary_cache_->WaitAll(sec_handles); + for (Handle* handle : handles) { + if (!handle) { + continue; + } + LRUHandle* lru_handle = reinterpret_cast(handle); + if (!lru_handle->IsPending()) { + continue; + } + uint32_t hash = GetHash(handle); + LRUCacheShard* shard = static_cast(GetShard(Shard(hash))); + shard->Promote(lru_handle); + } + } } std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits, bool strict_capacity_limit, double high_pri_pool_ratio, std::shared_ptr memory_allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy) { + CacheMetadataChargePolicy metadata_charge_policy, + const std::shared_ptr& secondary_cache) { if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } @@ -569,7 +760,25 @@ std::shared_ptr NewLRUCache( } return std::make_shared( capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, - std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy); + std::move(memory_allocator), use_adaptive_mutex, metadata_charge_policy, + secondary_cache); +} + +std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { + return NewLRUCache( + cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, + cache_opts.metadata_charge_policy, cache_opts.secondary_cache); } +std::shared_ptr NewLRUCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + double high_pri_pool_ratio, + std::shared_ptr memory_allocator, bool use_adaptive_mutex, + CacheMetadataChargePolicy metadata_charge_policy) { + return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, memory_allocator, use_adaptive_mutex, + metadata_charge_policy, nullptr); +} } // namespace ROCKSDB_NAMESPACE diff --git a/cache/lru_cache.h b/cache/lru_cache.h index 827e0bece2b..af0155ad9fb 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -1,4 +1,4 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). @@ -8,12 +8,13 @@ // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #pragma once +#include #include #include "cache/sharded_cache.h" - #include "port/malloc.h" #include "port/port.h" +#include "rocksdb/secondary_cache.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { @@ -49,8 +50,18 @@ namespace ROCKSDB_NAMESPACE { struct LRUHandle { void* value; - void (*deleter)(const Slice&, void* value); - LRUHandle* next_hash; + union Info { + Info() {} + ~Info() {} + Cache::DeleterFn deleter; + const ShardedCache::CacheItemHelper* helper; + } info_; + // An entry is not added to the LRUHandleTable until the secondary cache + // lookup is complete, so its safe to have this union. + union { + LRUHandle* next_hash; + SecondaryCacheResultHandle* sec_handle; + }; LRUHandle* next; LRUHandle* prev; size_t charge; // TODO(opt): Only allow uint32_t? @@ -67,8 +78,14 @@ struct LRUHandle { IS_HIGH_PRI = (1 << 1), // Whether this entry is in high-pri pool. IN_HIGH_PRI_POOL = (1 << 2), - // Wwhether this entry has had any lookups (hits). + // Whether this entry has had any lookups (hits). HAS_HIT = (1 << 3), + // Can this be inserted into the tiered cache + IS_TIERED_CACHE_COMPATIBLE = (1 << 4), + // Is the handle still being read from a lower tier + IS_PENDING = (1 << 5), + // Has the item been promoted from a lower tier + IS_PROMOTED = (1 << 6), }; uint8_t flags; @@ -95,6 +112,11 @@ struct LRUHandle { bool IsHighPri() const { return flags & IS_HIGH_PRI; } bool InHighPriPool() const { return flags & IN_HIGH_PRI_POOL; } bool HasHit() const { return flags & HAS_HIT; } + bool IsSecondaryCacheCompatible() const { + return flags & IS_TIERED_CACHE_COMPATIBLE; + } + bool IsPending() const { return flags & IS_PENDING; } + bool IsPromoted() const { return flags & IS_PROMOTED; } void SetInCache(bool in_cache) { if (in_cache) { @@ -122,15 +144,50 @@ struct LRUHandle { void SetHit() { flags |= HAS_HIT; } + void SetSecondaryCacheCompatible(bool tiered) { + if (tiered) { + flags |= IS_TIERED_CACHE_COMPATIBLE; + } else { + flags &= ~IS_TIERED_CACHE_COMPATIBLE; + } + } + + void SetIncomplete(bool incomp) { + if (incomp) { + flags |= IS_PENDING; + } else { + flags &= ~IS_PENDING; + } + } + + void SetPromoted(bool promoted) { + if (promoted) { + flags |= IS_PROMOTED; + } else { + flags &= ~IS_PROMOTED; + } + } + void Free() { assert(refs == 0); - if (deleter) { - (*deleter)(key(), value); + if (!IsSecondaryCacheCompatible() && info_.deleter) { + (*info_.deleter)(key(), value); + } else if (IsSecondaryCacheCompatible()) { + if (IsPending()) { + assert(sec_handle != nullptr); + SecondaryCacheResultHandle* tmp_sec_handle = sec_handle; + tmp_sec_handle->Wait(); + value = tmp_sec_handle->Value(); + delete tmp_sec_handle; + } + if (value) { + (*info_.helper->del_cb)(key(), value); + } } delete[] reinterpret_cast(this); } - // Caclculate the memory usage by metadata + // Calculate the memory usage by metadata inline size_t CalcTotalCharge( CacheMetadataChargePolicy metadata_charge_policy) { size_t meta_charge = 0; @@ -153,7 +210,10 @@ struct LRUHandle { // 4.4.3's builtin hashtable. class LRUHandleTable { public: - LRUHandleTable(); + // If the table uses more hash bits than `max_upper_hash_bits`, + // it will eat into the bits used for sharding, which are constant + // for a given LRUHandleTable. 
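
Editor's note: the reworked LRUHandle above stores either a plain deleter or a CacheItemHelper pointer in a union discriminated by a flag bit, and overlays next_hash with sec_handle because an entry is never in the hash table while its secondary-cache lookup is pending. A toy version of the flag-discriminated union (all types below are stand-ins for illustration, not the RocksDB definitions):

#include <cstdint>
#include <iostream>

using DeleterFn = void (*)(const char* key, void* value);
struct CacheItemHelper {
  DeleterFn del_cb;
};

struct Handle {
  static constexpr uint8_t kSecondaryCompatible = 1 << 4;  // mirrors the flag idea
  uint8_t flags = 0;
  union Info {
    DeleterFn deleter;
    const CacheItemHelper* helper;
  } info_;

  // Callers must consult the flag before deciding which union member is live.
  DeleterFn GetDeleter() const {
    return (flags & kSecondaryCompatible) ? info_.helper->del_cb : info_.deleter;
  }
};

int main() {
  static const CacheItemHelper helper{
      [](const char* /*key*/, void* /*value*/) { std::cout << "helper del_cb\n"; }};

  Handle h;
  h.flags |= Handle::kSecondaryCompatible;  // marks info_.helper as the live member
  h.info_.helper = &helper;
  h.GetDeleter()("key", nullptr);           // prints "helper del_cb"
  return 0;
}
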
+ explicit LRUHandleTable(int max_upper_hash_bits); ~LRUHandleTable(); LRUHandle* Lookup(const Slice& key, uint32_t hash); @@ -161,8 +221,8 @@ class LRUHandleTable { LRUHandle* Remove(const Slice& key, uint32_t hash); template - void ApplyToAllCacheEntries(T func) { - for (uint32_t i = 0; i < length_; i++) { + void ApplyToEntriesRange(T func, uint32_t index_begin, uint32_t index_end) { + for (uint32_t i = index_begin; i < index_end; i++) { LRUHandle* h = list_[i]; while (h != nullptr) { auto n = h->next_hash; @@ -173,6 +233,8 @@ class LRUHandleTable { } } + int GetLengthBits() const { return length_bits_; } + private: // Return a pointer to slot that points to a cache entry that // matches key/hash. If there is no such cache entry, return a @@ -181,11 +243,19 @@ class LRUHandleTable { void Resize(); + // Number of hash bits (upper because lower bits used for sharding) + // used for table index. Length == 1 << length_bits_ + int length_bits_; + // The table consists of an array of buckets where each bucket is // a linked list of cache entries that hash into the bucket. - LRUHandle** list_; - uint32_t length_; + std::unique_ptr list_; + + // Number of elements currently in the table uint32_t elems_; + + // Set from max_upper_hash_bits (see constructor) + const int max_length_bits_; }; // A single shard of sharded cache. @@ -193,7 +263,9 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { public: LRUCacheShard(size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy); + CacheMetadataChargePolicy metadata_charge_policy, + int max_upper_hash_bits, + const std::shared_ptr& secondary_cache); virtual ~LRUCacheShard() override = default; // Separate from constructor so caller can easily make an array of LRUCache @@ -209,11 +281,34 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // Like Cache methods, but with an extra "hash" parameter. 
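
Editor's note: a toy driver for the ApplyToSomeEntries()/ApplyToEntriesRange() protocol above: the caller keeps a 32-bit state cursor, passes it back on every call, and stops once the shard sets it to UINT32_MAX. The bucket count and batch size below are made up for illustration:

#include <cstdint>
#include <iostream>

int main() {
  const uint32_t length_bits = 5;                      // 32 buckets in this toy shard
  const uint32_t length = uint32_t{1} << length_bits;
  const uint32_t average_entries_per_lock = 12;        // buckets visited per batch

  uint32_t state = 0;
  while (true) {
    // What the shard does internally with the cursor on each call:
    uint32_t index_begin = state >> (32 - length_bits);
    uint32_t index_end = index_begin + average_entries_per_lock;
    if (index_end >= length) {
      index_end = length;        // last batch: run to the end of the table
      state = UINT32_MAX;        // tell the caller iteration is complete
    } else {
      state = index_end << (32 - length_bits);
    }
    std::cout << "visit buckets [" << index_begin << ", " << index_end << ")\n";
    if (state == UINT32_MAX) break;
  }
  // Prints [0, 12), [12, 24), [24, 32): three batches cover every bucket once.
  return 0;
}
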
virtual Status Insert(const Slice& key, uint32_t hash, void* value, - size_t charge, - void (*deleter)(const Slice& key, void* value), + size_t charge, Cache::DeleterFn deleter, Cache::Handle** handle, - Cache::Priority priority) override; - virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; + Cache::Priority priority) override { + return Insert(key, hash, value, charge, deleter, nullptr, handle, priority); + } + virtual Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, + Cache::Handle** handle, + Cache::Priority priority) override { + assert(helper); + return Insert(key, hash, value, charge, nullptr, helper, handle, priority); + } + // If helper_cb is null, the values of the following arguments don't + // matter + virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const ShardedCache::CacheItemHelper* helper, + const ShardedCache::CreateCallback& create_cb, + ShardedCache::Priority priority, + bool wait) override; + virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override { + return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true); + } + virtual bool Release(Cache::Handle* handle, bool /*useful*/, + bool force_erase) override { + return Release(handle, force_erase); + } + virtual bool IsReady(Cache::Handle* /*handle*/) override; + virtual void Wait(Cache::Handle* /*handle*/) override {} virtual bool Ref(Cache::Handle* handle) override; virtual bool Release(Cache::Handle* handle, bool force_erase = false) override; @@ -226,8 +321,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { virtual size_t GetUsage() const override; virtual size_t GetPinnedUsage() const override; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + virtual void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) override; virtual void EraseUnRefEntries() override; @@ -239,10 +336,27 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // not threadsafe size_t TEST_GetLRUSize(); - // Retrives high pri pool ratio + // Retrieves high pri pool ratio double GetHighPriPoolRatio(); private: + friend class LRUCache; + // Insert an item into the hash table and, if handle is null, insert into + // the LRU list. Older items are evicted as necessary. If the cache is full + // and free_handle_on_fail is true, the item is deleted and handle is set to. + Status InsertItem(LRUHandle* item, Cache::Handle** handle, + bool free_handle_on_fail); + Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, + DeleterFn deleter, const Cache::CacheItemHelper* helper, + Cache::Handle** handle, Cache::Priority priority); + // Promote an item looked up from the secondary cache to the LRU cache. The + // item is only inserted into the hash table and not the LRU list, and only + // if the cache is not at full capacity, as is the case during Insert. The + // caller should hold a reference on the LRUHandle. When the caller releases + // the last reference, the item is added to the LRU list. + // The item is promoted to the high pri or low pri pool as specified by the + // caller in Lookup. 
+ void Promote(LRUHandle* e); void LRU_Remove(LRUHandle* e); void LRU_Insert(LRUHandle* e); @@ -303,6 +417,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { // We don't count mutex_ as the cache's internal state so semantically we // don't mind mutex_ invoking the non-const actions. mutable port::Mutex mutex_; + + std::shared_ptr secondary_cache_; }; class LRUCache @@ -316,24 +432,28 @@ class LRUCache std::shared_ptr memory_allocator = nullptr, bool use_adaptive_mutex = kDefaultToAdaptiveMutex, CacheMetadataChargePolicy metadata_charge_policy = - kDontChargeCacheMetadata); + kDontChargeCacheMetadata, + const std::shared_ptr& secondary_cache = nullptr); virtual ~LRUCache(); virtual const char* Name() const override { return "LRUCache"; } - virtual CacheShard* GetShard(int shard) override; - virtual const CacheShard* GetShard(int shard) const override; + virtual CacheShard* GetShard(uint32_t shard) override; + virtual const CacheShard* GetShard(uint32_t shard) const override; virtual void* Value(Handle* handle) override; virtual size_t GetCharge(Handle* handle) const override; virtual uint32_t GetHash(Handle* handle) const override; + virtual DeleterFn GetDeleter(Handle* handle) const override; virtual void DisownData() override; + virtual void WaitAll(std::vector& handles) override; // Retrieves number of elements in LRU, for unit test purpose only size_t TEST_GetLRUSize(); - // Retrives high pri pool ratio + // Retrieves high pri pool ratio double GetHighPriPoolRatio(); private: LRUCacheShard* shards_ = nullptr; int num_shards_ = 0; + std::shared_ptr secondary_cache_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index 08c05024aef..d20fd246360 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -7,8 +7,18 @@ #include #include + +#include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" #include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/io_status.h" +#include "rocksdb/sst_file_manager.h" #include "test_util/testharness.h" +#include "util/coding.h" +#include "util/random.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -30,9 +40,10 @@ class LRUCacheTest : public testing::Test { DeleteCache(); cache_ = reinterpret_cast( port::cacheline_aligned_alloc(sizeof(LRUCacheShard))); - new (cache_) LRUCacheShard(capacity, false /*strict_capcity_limit*/, - high_pri_pool_ratio, use_adaptive_mutex, - kDontChargeCacheMetadata); + new (cache_) LRUCacheShard( + capacity, false /*strict_capcity_limit*/, high_pri_pool_ratio, + use_adaptive_mutex, kDontChargeCacheMetadata, + 24 /*max_upper_hash_bits*/, nullptr /*secondary_cache*/); } void Insert(const std::string& key, @@ -191,6 +202,978 @@ TEST_F(LRUCacheTest, EntriesWithPriority) { ValidateLRUList({"e", "f", "g", "Z", "d"}, 2); } +class TestSecondaryCache : public SecondaryCache { + public: + // Specifies what action to take on a lookup for a particular key + enum ResultType { + SUCCESS, + // Fail lookup immediately + FAIL, + // Defer the result. 
It will returned after Wait/WaitAll is called + DEFER, + // Defer the result and eventually return failure + DEFER_AND_FAIL + }; + + using ResultMap = std::unordered_map; + + explicit TestSecondaryCache(size_t capacity) + : num_inserts_(0), num_lookups_(0), inject_failure_(false) { + cache_ = NewLRUCache(capacity, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + } + ~TestSecondaryCache() override { cache_.reset(); } + + std::string Name() override { return "TestSecondaryCache"; } + + void InjectFailure() { inject_failure_ = true; } + + void ResetInjectFailure() { inject_failure_ = false; } + + void SetDbSessionId(const std::string& db_session_id) { + db_session_id_ = db_session_id; + } + + Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) override { + if (inject_failure_) { + return Status::Corruption("Insertion Data Corrupted"); + } + assert(IsDbSessionIdAsKeyPrefix(key) == true); + size_t size; + char* buf; + Status s; + + num_inserts_++; + size = (*helper->size_cb)(value); + buf = new char[size + sizeof(uint64_t)]; + EncodeFixed64(buf, size); + s = (*helper->saveto_cb)(value, 0, size, buf + sizeof(uint64_t)); + if (!s.ok()) { + delete[] buf; + return s; + } + return cache_->Insert(key, buf, size, + [](const Slice& /*key*/, void* val) -> void { + delete[] static_cast(val); + }); + } + + std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, + bool /*wait*/) override { + std::string key_str = key.ToString(); + TEST_SYNC_POINT_CALLBACK("TestSecondaryCache::Lookup", &key_str); + + std::unique_ptr secondary_handle; + ResultType type = ResultType::SUCCESS; + auto iter = result_map_.find(key.ToString()); + if (iter != result_map_.end()) { + type = iter->second; + } + if (type == ResultType::FAIL) { + return secondary_handle; + } + + Cache::Handle* handle = cache_->Lookup(key); + num_lookups_++; + if (handle) { + void* value = nullptr; + size_t charge = 0; + Status s; + if (type != ResultType::DEFER_AND_FAIL) { + char* ptr = (char*)cache_->Value(handle); + size_t size = DecodeFixed64(ptr); + ptr += sizeof(uint64_t); + s = create_cb(ptr, size, &value, &charge); + } + if (s.ok()) { + secondary_handle.reset(new TestSecondaryCacheResultHandle( + cache_.get(), handle, value, charge, type)); + } else { + cache_->Release(handle); + } + } + return secondary_handle; + } + + void Erase(const Slice& /*key*/) override {} + + void WaitAll(std::vector handles) override { + for (SecondaryCacheResultHandle* handle : handles) { + TestSecondaryCacheResultHandle* sec_handle = + static_cast(handle); + sec_handle->SetReady(); + } + } + + std::string GetPrintableOptions() const override { return ""; } + + void SetResultMap(ResultMap&& map) { result_map_ = std::move(map); } + + uint32_t num_inserts() { return num_inserts_; } + + uint32_t num_lookups() { return num_lookups_; } + + bool IsDbSessionIdAsKeyPrefix(const Slice& key) { + if (db_session_id_.size() == 0) { + return true; + } + if (key.size() < 20) { + return false; + } + std::string s_key = key.ToString(); + if (s_key.substr(0, 20) != db_session_id_) { + return false; + } + return true; + } + + private: + class TestSecondaryCacheResultHandle : public SecondaryCacheResultHandle { + public: + TestSecondaryCacheResultHandle(Cache* cache, Cache::Handle* handle, + void* value, size_t size, ResultType type) + : cache_(cache), + handle_(handle), + value_(value), + size_(size), + is_ready_(true) { + if (type != ResultType::SUCCESS) { + is_ready_ = false; + } + } + + 
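
Editor's note: TestSecondaryCache::Insert() above flattens an object with the helper's size_cb/saveto_cb and prefixes the bytes with their length, and Lookup() reverses that with create_cb. A standalone round-trip sketch of that contract (Item and the three callbacks are illustrative; the 8-byte host-endian prefix stands in for EncodeFixed64):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

struct Item {
  std::string payload;
};

// The three roles played by a CacheItemHelper / CreateCallback:
size_t SizeCb(const Item& item) { return item.payload.size(); }
void SaveToCb(const Item& item, char* out) {
  std::memcpy(out, item.payload.data(), item.payload.size());
}
Item CreateCb(const char* buf, size_t size) { return Item{std::string(buf, size)}; }

int main() {
  Item item{"hello, secondary cache"};

  // "Insert": frame the object as [8-byte length][payload bytes].
  uint64_t len = SizeCb(item);
  std::vector<char> buf(sizeof(len) + len);
  std::memcpy(buf.data(), &len, sizeof(len));
  SaveToCb(item, buf.data() + sizeof(len));

  // "Lookup": read the length back, then rebuild the object.
  uint64_t stored_len = 0;
  std::memcpy(&stored_len, buf.data(), sizeof(stored_len));
  Item restored = CreateCb(buf.data() + sizeof(stored_len), stored_len);
  assert(restored.payload == item.payload);
  return 0;
}
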
~TestSecondaryCacheResultHandle() override { cache_->Release(handle_); } + + bool IsReady() override { return is_ready_; } + + void Wait() override {} + + void* Value() override { + assert(is_ready_); + return value_; + } + + size_t Size() override { return Value() ? size_ : 0; } + + void SetReady() { is_ready_ = true; } + + private: + Cache* cache_; + Cache::Handle* handle_; + void* value_; + size_t size_; + bool is_ready_; + }; + + std::shared_ptr cache_; + uint32_t num_inserts_; + uint32_t num_lookups_; + bool inject_failure_; + std::string db_session_id_; + ResultMap result_map_; +}; + +class DBSecondaryCacheTest : public DBTestBase { + public: + DBSecondaryCacheTest() + : DBTestBase("/db_secondary_cache_test", /*env_do_fsync=*/true) { + fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem())); + fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_)); + } + + std::shared_ptr fault_fs_; + std::unique_ptr fault_env_; +}; + +class LRUSecondaryCacheTest : public LRUCacheTest { + public: + LRUSecondaryCacheTest() : fail_create_(false) {} + ~LRUSecondaryCacheTest() {} + + protected: + class TestItem { + public: + TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) { + memcpy(buf_.get(), buf, size); + } + ~TestItem() {} + + char* Buf() { return buf_.get(); } + size_t Size() { return size_; } + std::string ToString() { return std::string(Buf(), Size()); } + + private: + std::unique_ptr buf_; + size_t size_; + }; + + static size_t SizeCallback(void* obj) { + return reinterpret_cast(obj)->Size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + TestItem* item = reinterpret_cast(from_obj); + char* buf = item->Buf(); + EXPECT_EQ(length, item->Size()); + EXPECT_EQ(from_offset, 0); + memcpy(out, buf, length); + return Status::OK(); + } + + static void DeletionCallback(const Slice& /*key*/, void* obj) { + delete reinterpret_cast(obj); + } + + static Cache::CacheItemHelper helper_; + + static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/, + size_t /*size*/, void* /*out*/) { + return Status::NotSupported(); + } + + static Cache::CacheItemHelper helper_fail_; + + Cache::CreateCallback test_item_creator = + [&](void* buf, size_t size, void** out_obj, size_t* charge) -> Status { + if (fail_create_) { + return Status::NotSupported(); + } + *out_obj = reinterpret_cast(new TestItem((char*)buf, size)); + *charge = size; + return Status::OK(); + }; + + void SetFailCreate(bool fail) { fail_create_ = fail; } + + private: + bool fail_create_; +}; + +Cache::CacheItemHelper LRUSecondaryCacheTest::helper_( + LRUSecondaryCacheTest::SizeCallback, LRUSecondaryCacheTest::SaveToCallback, + LRUSecondaryCacheTest::DeletionCallback); + +Cache::CacheItemHelper LRUSecondaryCacheTest::helper_fail_( + LRUSecondaryCacheTest::SizeCallback, + LRUSecondaryCacheTest::SaveToCallbackFail, + LRUSecondaryCacheTest::DeletionCallback); + +TEST_F(LRUSecondaryCacheTest, BasicTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), 
str2.length()); + // k2 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_, + str2.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should promote k1 and demote k2 + handle = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, BasicFailTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_NOK(cache->Insert("k1", item1, nullptr, str1.length())); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", nullptr, test_item_creator, Cache::Priority::LOW, + true); + ASSERT_EQ(handle, nullptr); + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, false); + ASSERT_EQ(handle, nullptr); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, SaveFailTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_fail_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_fail_, + str2.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should fail, since k1 demotion would have failed + handle = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_EQ(handle, nullptr); + // Since k1 didn't get promoted, k2 should still be in cache + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_fail_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, CreateFailTest) { + LRUCacheOptions opts(1024, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = 
rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_, + str2.length())); + + Cache::Handle* handle; + SetFailCreate(true); + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + // This lookup should fail, since k1 creation would have failed + handle = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_EQ(handle, nullptr); + // Since k1 didn't get promoted, k2 should still be in cache + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_F(LRUSecondaryCacheTest, FullCapacityTest) { + LRUCacheOptions opts(1024, 0, /*_strict_capacity_limit=*/true, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(2048); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + + Random rnd(301); + std::string str1 = rnd.RandomString(1020); + TestItem* item1 = new TestItem(str1.data(), str1.length()); + ASSERT_OK(cache->Insert("k1", item1, &LRUSecondaryCacheTest::helper_, + str1.length())); + std::string str2 = rnd.RandomString(1020); + TestItem* item2 = new TestItem(str2.data(), str2.length()); + // k1 should be demoted to NVM + ASSERT_OK(cache->Insert("k2", item2, &LRUSecondaryCacheTest::helper_, + str2.length())); + + Cache::Handle* handle; + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + // k1 promotion should fail due to the block cache being at capacity, + // but the lookup should still succeed + Cache::Handle* handle2; + handle2 = cache->Lookup("k1", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle2, nullptr); + // Since k1 didn't get inserted, k2 should still be in cache + cache->Release(handle); + cache->Release(handle2); + handle = cache->Lookup("k2", &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, true); + ASSERT_NE(handle, nullptr); + cache->Release(handle); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 1u); + + cache.reset(); + secondary_cache.reset(); +} + +// In this test, the block cache size is set to 4096, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, in any situation, +// if we try to insert block_1 to the block cache, it will always fails. Only +// block_2 will be successfully inserted into the block cache. 
+TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness1) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + + // Set the file paranoid check, so after flush, the file will be read + // all the blocks will be accessed. + options.paranoid_file_checks = true; + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. Note that, block_1 is never successfully + // inserted to the block cache. Here are 2 lookups in the secondary cache + // for block_1 and block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. Meta blocks are always cached. When block_1 is read + // out, block_2 is evicted from block cache and inserted to secondary + // cache. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // The first data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_1. But block_1 will not + // be inserted successfully due to the size. Currently, cache only has + // the meta blocks. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // The second data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_2 and block_2 is found + // in the secondary cache. Now block cache has block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // block_2 is in the block cache. There is a block cache hit. No need to + // lookup or insert the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 6u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. 
After Get, still + // block_1 is will not be cached. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 7u); + + Destroy(options); +} + +// In this test, the block cache size is set to 6100, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, we can successfully +// insert and cache block_1 in the block cache (this is the different place +// from TestSecondaryCacheCorrectness1) +TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) { + LRUCacheOptions opts(6100, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.paranoid_file_checks = true; + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. Thefore, block_1 is evicted from block + // cache and successfully inserted to the secondary cache. Here are 2 + // lookups in the secondary cache for block_1 and block_2. + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. After Flush, only block_2 is cached in block cache + // and block_1 is in the secondary cache. So when read block_1, it is + // read out from secondary cache and inserted to block cache. At the same + // time, block_2 is inserted to secondary cache. Now, secondary cache has + // both block_1 and block_2. After compaction, block_1 is in the cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is cached in block cache + // there is no secondary cache lookup. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_2 which is not in the block cache. So + // it will lookup the secondary cache for block_2 and cache it in the + // block_cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_2 which is already in the block cache. + // No need to lookup secondary cache. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is not in block cache + // there is one econdary cache lookup. Then, block_1 is cached in the + // block cache. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // This Get needs to access block_1, since block_1 is cached in block cache + // there is no secondary cache lookup. + ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + Destroy(options); +} + +// The block cache size is set to 1024*1024, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, we can successfully +// cache all the blocks in the block cache and there is not secondary cache +// insertion. 2 lookup is needed for the blocks. +TEST_F(DBSecondaryCacheTest, NoSecondaryCacheInsertion) { + LRUCacheOptions opts(1024 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.paranoid_file_checks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1000); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. Now, block cache is large enough, it cache + // both block_1 and block_2. When first time read block_1 and block_2 + // there are cache misses. So 2 secondary cache lookups are needed for + // the 2 blocks + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Compact("a", "z"); + // Compaction will iterate the whole SST file. Since all the data blocks + // are in the block cache. No need to lookup the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1000, v.size()); + // Since the block cache is large enough, all the blocks are cached. we + // do not need to lookup the seondary cache. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + Destroy(options); +} + +TEST_F(DBSecondaryCacheTest, SecondaryCacheIntensiveTesting) { + LRUCacheOptions opts(8 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 256; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1000); + ASSERT_OK(Put(Key(i), p_v)); + } + ASSERT_OK(Flush()); + Compact("a", "z"); + + Random r_index(47); + std::string v; + for (int i = 0; i < 1000; i++) { + uint32_t key_i = r_index.Next() % N; + v = Get(Key(key_i)); + } + + // We have over 200 data blocks there will be multiple insertion + // and lookups. + ASSERT_GE(secondary_cache->num_inserts(), 1u); + ASSERT_GE(secondary_cache->num_lookups(), 1u); + + Destroy(options); +} + +// In this test, the block cache size is set to 4096, after insert 6 KV-pairs +// and flush, there are 5 blocks in this SST file, 2 data blocks and 3 meta +// blocks. block_1 size is 4096 and block_2 size is 2056. The total size +// of the meta blocks are about 900 to 1000. Therefore, in any situation, +// if we try to insert block_1 to the block cache, it will always fails. Only +// block_2 will be successfully inserted into the block cache. +TEST_F(DBSecondaryCacheTest, SecondaryCacheFailureTest) { + LRUCacheOptions opts(4 * 1024, 0, false, 0.5, nullptr, + kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.paranoid_file_checks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.env = fault_env_.get(); + fault_fs_->SetFailGetUniqueId(true); + DestroyAndReopen(options); + std::string session_id; + ASSERT_OK(db_->GetDbSessionId(session_id)); + secondary_cache->SetDbSessionId(session_id); + Random rnd(301); + const int N = 6; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(1007); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB do the paranoid check for the new + // SST file. Meta blocks are always cached in the block cache and they + // will not be evicted. When block_2 is cache miss and read out, it is + // inserted to the block cache. Note that, block_1 is never successfully + // inserted to the block cache. 
Here are 2 lookups in the secondary cache + // for block_1 and block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 2u); + + // Fail the insertion, in LRU cache, the secondary insertion returned status + // is not checked, therefore, the DB will not be influenced. + secondary_cache->InjectFailure(); + Compact("a", "z"); + // Compaction will create the iterator to scan the whole file. So all the + // blocks are needed. Meta blocks are always cached. When block_1 is read + // out, block_2 is evicted from block cache and inserted to secondary + // cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // The first data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_1. But block_1 will not + // be inserted successfully due to the size. Currently, cache only has + // the meta blocks. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // The second data block is not in the cache, similarly, trigger the block + // cache Lookup and secondary cache lookup for block_2 and block_2 is found + // in the secondary cache. Now block cache has block_2 + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + // block_2 is in the block cache. There is a block cache hit. No need to + // lookup or insert the secondary cache. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 5u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. + ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 6u); + + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + // Lookup the first data block, not in the block cache, so lookup the + // secondary cache. Also not in the secondary cache. After Get, still + // block_1 is will not be cached. 
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 7u); + secondary_cache->ResetInjectFailure(); + + Destroy(options); +} + +TEST_F(LRUSecondaryCacheTest, BasicWaitAllTest) { + LRUCacheOptions opts(1024, 2, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache = + std::make_shared(32 * 1024); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + const int num_keys = 32; + + Random rnd(301); + std::vector values; + for (int i = 0; i < num_keys; ++i) { + std::string str = rnd.RandomString(1020); + values.emplace_back(str); + TestItem* item = new TestItem(str.data(), str.length()); + ASSERT_OK(cache->Insert("k" + std::to_string(i), item, + &LRUSecondaryCacheTest::helper_, str.length())); + } + // Force all entries to be evicted to the secondary cache + cache->SetCapacity(0); + ASSERT_EQ(secondary_cache->num_inserts(), 32u); + cache->SetCapacity(32 * 1024); + + secondary_cache->SetResultMap( + {{"k3", TestSecondaryCache::ResultType::DEFER}, + {"k4", TestSecondaryCache::ResultType::DEFER_AND_FAIL}, + {"k5", TestSecondaryCache::ResultType::FAIL}}); + std::vector results; + for (int i = 0; i < 6; ++i) { + results.emplace_back( + cache->Lookup("k" + std::to_string(i), &LRUSecondaryCacheTest::helper_, + test_item_creator, Cache::Priority::LOW, false)); + } + cache->WaitAll(results); + for (int i = 0; i < 6; ++i) { + if (i == 4) { + ASSERT_EQ(cache->Value(results[i]), nullptr); + } else if (i == 5) { + ASSERT_EQ(results[i], nullptr); + continue; + } else { + TestItem* item = static_cast(cache->Value(results[i])); + ASSERT_EQ(item->ToString(), values[i]); + } + cache->Release(results[i]); + } + + cache.reset(); + secondary_cache.reset(); +} + +// In this test, we have one KV pair per data block. We indirectly determine +// the cache key associated with each data block (and thus each KV) by using +// a sync point callback in TestSecondaryCache::Lookup. We then control the +// lookup result by setting the ResultMap. +TEST_F(DBSecondaryCacheTest, TestSecondaryCacheMultiGet) { + LRUCacheOptions opts(1 << 20, 0, false, 0.5, nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata); + std::shared_ptr secondary_cache( + new TestSecondaryCache(2048 * 1024)); + opts.secondary_cache = secondary_cache; + std::shared_ptr cache = NewLRUCache(opts); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.block_size = 4 * 1024; + table_options.cache_index_and_filter_blocks = false; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.paranoid_file_checks = true; + DestroyAndReopen(options); + Random rnd(301); + const int N = 8; + std::vector keys; + for (int i = 0; i < N; i++) { + std::string p_v = rnd.RandomString(4000); + keys.emplace_back(p_v); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + // After Flush is successful, RocksDB does the paranoid check for the new + // SST file. This will try to lookup all data blocks in the secondary + // cache. 
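
Editor's note: BasicWaitAllTest above exercises the deferred-lookup flow end to end: lookups issued with wait=false may return pending handles, WaitAll() resolves them in one pass, and the caller then distinguishes handles whose value arrived from those whose secondary lookup ultimately failed. A toy model of just that control flow (PendingHandle and WaitAll below are illustrations, not the RocksDB types):

#include <cassert>
#include <string>
#include <vector>

struct PendingHandle {
  bool ready = false;
  bool will_fail = false;   // models a DEFER_AND_FAIL result
  std::string value;

  void Complete() {
    ready = true;
    if (!will_fail) {
      value = "payload";
    }
  }
  const std::string* Value() const {
    return (ready && !will_fail) ? &value : nullptr;
  }
};

// Resolve every still-pending handle in a single pass, like Cache::WaitAll().
void WaitAll(std::vector<PendingHandle*>& handles) {
  for (PendingHandle* h : handles) {
    if (h != nullptr && !h->ready) {
      h->Complete();
    }
  }
}

int main() {
  PendingHandle deferred;             // DEFER: resolves with a value
  PendingHandle deferred_and_failed;  // DEFER_AND_FAIL: resolves without one
  deferred_and_failed.will_fail = true;

  // A nullptr entry models FAIL, where Lookup returned no handle at all.
  std::vector<PendingHandle*> handles{&deferred, &deferred_and_failed, nullptr};
  WaitAll(handles);

  assert(deferred.Value() != nullptr);             // usable after WaitAll
  assert(deferred_and_failed.Value() == nullptr);  // caller must check and release
  return 0;
}
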
+ ASSERT_EQ(secondary_cache->num_inserts(), 0u); + ASSERT_EQ(secondary_cache->num_lookups(), 8u); + + cache->SetCapacity(0); + ASSERT_EQ(secondary_cache->num_inserts(), 8u); + cache->SetCapacity(1 << 20); + + std::vector cache_keys; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "TestSecondaryCache::Lookup", [&cache_keys](void* key) -> void { + cache_keys.emplace_back(*(static_cast(key))); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + for (int i = 0; i < N; ++i) { + std::string v = Get(Key(i)); + ASSERT_EQ(4000, v.size()); + ASSERT_EQ(v, keys[i]); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_EQ(secondary_cache->num_lookups(), 16u); + cache->SetCapacity(0); + cache->SetCapacity(1 << 20); + + ASSERT_EQ(Get(Key(2)), keys[2]); + ASSERT_EQ(Get(Key(7)), keys[7]); + secondary_cache->SetResultMap( + {{cache_keys[3], TestSecondaryCache::ResultType::DEFER}, + {cache_keys[4], TestSecondaryCache::ResultType::DEFER_AND_FAIL}, + {cache_keys[5], TestSecondaryCache::ResultType::FAIL}}); + + std::vector mget_keys( + {Key(0), Key(1), Key(2), Key(3), Key(4), Key(5), Key(6), Key(7)}); + std::vector values(mget_keys.size()); + std::vector s(keys.size()); + std::vector key_slices; + for (const std::string& key : mget_keys) { + key_slices.emplace_back(key); + } + uint32_t num_lookups = secondary_cache->num_lookups(); + dbfull()->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), + key_slices.size(), key_slices.data(), values.data(), + s.data(), false); + ASSERT_EQ(secondary_cache->num_lookups(), num_lookups + 5); + for (int i = 0; i < N; ++i) { + ASSERT_OK(s[i]); + ASSERT_EQ(values[i].ToString(), keys[i]); + values[i].Reset(); + } + Destroy(options); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc index 6c915df8cc8..bf90ea3b10f 100644 --- a/cache/sharded_cache.cc +++ b/cache/sharded_cache.cc @@ -9,53 +9,96 @@ #include "cache/sharded_cache.h" -#include +#include +#include +#include +#include "util/hash.h" +#include "util/math.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { +namespace { + +inline uint32_t HashSlice(const Slice& s) { + return Lower32of64(GetSliceNPHash64(s)); +} + +} // namespace + ShardedCache::ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, std::shared_ptr allocator) : Cache(std::move(allocator)), - num_shard_bits_(num_shard_bits), + shard_mask_((uint32_t{1} << num_shard_bits) - 1), capacity_(capacity), strict_capacity_limit_(strict_capacity_limit), last_id_(1) {} void ShardedCache::SetCapacity(size_t capacity) { - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; MutexLock l(&capacity_mutex_); - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { GetShard(s)->SetCapacity(per_shard); } capacity_ = capacity; } void ShardedCache::SetStrictCapacityLimit(bool strict_capacity_limit) { - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); MutexLock l(&capacity_mutex_); - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { GetShard(s)->SetStrictCapacityLimit(strict_capacity_limit); } strict_capacity_limit_ = strict_capacity_limit; } Status ShardedCache::Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle, Priority priority) { + DeleterFn 
deleter, Handle** handle, + Priority priority) { uint32_t hash = HashSlice(key); return GetShard(Shard(hash)) ->Insert(key, hash, value, charge, deleter, handle, priority); } +Status ShardedCache::Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t charge, + Handle** handle, Priority priority) { + uint32_t hash = HashSlice(key); + if (!helper) { + return Status::InvalidArgument(); + } + return GetShard(Shard(hash)) + ->Insert(key, hash, value, helper, charge, handle, priority); +} + Cache::Handle* ShardedCache::Lookup(const Slice& key, Statistics* /*stats*/) { uint32_t hash = HashSlice(key); return GetShard(Shard(hash))->Lookup(key, hash); } +Cache::Handle* ShardedCache::Lookup(const Slice& key, + const CacheItemHelper* helper, + const CreateCallback& create_cb, + Priority priority, bool wait, + Statistics* /*stats*/) { + uint32_t hash = HashSlice(key); + return GetShard(Shard(hash)) + ->Lookup(key, hash, helper, create_cb, priority, wait); +} + +bool ShardedCache::IsReady(Handle* handle) { + uint32_t hash = GetHash(handle); + return GetShard(Shard(hash))->IsReady(handle); +} + +void ShardedCache::Wait(Handle* handle) { + uint32_t hash = GetHash(handle); + GetShard(Shard(hash))->Wait(handle); +} + bool ShardedCache::Ref(Handle* handle) { uint32_t hash = GetHash(handle); return GetShard(Shard(hash))->Ref(handle); @@ -66,6 +109,11 @@ bool ShardedCache::Release(Handle* handle, bool force_erase) { return GetShard(Shard(hash))->Release(handle, force_erase); } +bool ShardedCache::Release(Handle* handle, bool useful, bool force_erase) { + uint32_t hash = GetHash(handle); + return GetShard(Shard(hash))->Release(handle, useful, force_erase); +} + void ShardedCache::Erase(const Slice& key) { uint32_t hash = HashSlice(key); GetShard(Shard(hash))->Erase(key, hash); @@ -87,9 +135,9 @@ bool ShardedCache::HasStrictCapacityLimit() const { size_t ShardedCache::GetUsage() const { // We will not lock the cache when getting the usage from shards. - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); size_t usage = 0; - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { usage += GetShard(s)->GetUsage(); } return usage; @@ -101,25 +149,42 @@ size_t ShardedCache::GetUsage(Handle* handle) const { size_t ShardedCache::GetPinnedUsage() const { // We will not lock the cache when getting the usage from shards. - int num_shards = 1 << num_shard_bits_; + uint32_t num_shards = GetNumShards(); size_t usage = 0; - for (int s = 0; s < num_shards; s++) { + for (uint32_t s = 0; s < num_shards; s++) { usage += GetShard(s)->GetPinnedUsage(); } return usage; } -void ShardedCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) { - int num_shards = 1 << num_shard_bits_; - for (int s = 0; s < num_shards; s++) { - GetShard(s)->ApplyToAllCacheEntries(callback, thread_safe); - } +void ShardedCache::ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) { + uint32_t num_shards = GetNumShards(); + // Iterate over part of each shard, rotating between shards, to + // minimize impact on latency of concurrent operations. 
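The ShardedCache changes route every operation through Shard(hash), which now selects a shard by masking the low bits of the hash (see the Shard() definition later in this diff), while SetCapacity divides the total capacity across shards with a round-up. A small standalone sketch of that arithmetic with illustrative values; the rotation loop of ApplyToAllEntries continues immediately after it.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Illustrative values only; this mirrors the arithmetic, not the class.
int main() {
  const int num_shard_bits = 6;  // 64 shards
  const uint32_t shard_mask = (uint32_t{1} << num_shard_bits) - 1;
  const uint32_t num_shards = shard_mask + 1;

  // Shard selection now masks the low bits of the hash instead of shifting
  // the high bits down.
  const uint32_t hash = 0xDEADBEEFu;
  const uint32_t shard = hash & shard_mask;

  // Total capacity is split across shards, rounded up.
  const size_t capacity = 1000;
  const size_t per_shard = (capacity + (num_shards - 1)) / num_shards;

  std::printf("shard=%u per_shard=%zu\n", shard, per_shard);  // shard=47 per_shard=16
  return 0;
}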
+ std::unique_ptr states(new uint32_t[num_shards]{}); + + uint32_t aepl_in_32 = static_cast( + std::min(size_t{UINT32_MAX}, opts.average_entries_per_lock)); + aepl_in_32 = std::min(aepl_in_32, uint32_t{1}); + + bool remaining_work; + do { + remaining_work = false; + for (uint32_t s = 0; s < num_shards; s++) { + if (states[s] != UINT32_MAX) { + GetShard(s)->ApplyToSomeEntries(callback, aepl_in_32, &states[s]); + remaining_work |= states[s] != UINT32_MAX; + } + } + } while (remaining_work); } void ShardedCache::EraseUnRefEntries() { - int num_shards = 1 << num_shard_bits_; - for (int s = 0; s < num_shards; s++) { + uint32_t num_shards = GetNumShards(); + for (uint32_t s = 0; s < num_shards; s++) { GetShard(s)->EraseUnRefEntries(); } } @@ -134,7 +199,8 @@ std::string ShardedCache::GetPrintableOptions() const { snprintf(buffer, kBufferSize, " capacity : %" ROCKSDB_PRIszt "\n", capacity_); ret.append(buffer); - snprintf(buffer, kBufferSize, " num_shard_bits : %d\n", num_shard_bits_); + snprintf(buffer, kBufferSize, " num_shard_bits : %d\n", + GetNumShardBits()); ret.append(buffer); snprintf(buffer, kBufferSize, " strict_capacity_limit : %d\n", strict_capacity_limit_); @@ -159,4 +225,8 @@ int GetDefaultCacheShardBits(size_t capacity) { return num_shard_bits; } +int ShardedCache::GetNumShardBits() const { return BitsSetToOne(shard_mask_); } + +uint32_t ShardedCache::GetNumShards() const { return shard_mask_ + 1; } + } // namespace ROCKSDB_NAMESPACE diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index ce9e459dc14..3e2a20abac4 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -14,7 +14,6 @@ #include "port/port.h" #include "rocksdb/cache.h" -#include "util/hash.h" namespace ROCKSDB_NAMESPACE { @@ -24,20 +23,37 @@ class CacheShard { CacheShard() = default; virtual ~CacheShard() = default; + using DeleterFn = Cache::DeleterFn; virtual Status Insert(const Slice& key, uint32_t hash, void* value, - size_t charge, - void (*deleter)(const Slice& key, void* value), + size_t charge, DeleterFn deleter, + Cache::Handle** handle, Cache::Priority priority) = 0; + virtual Status Insert(const Slice& key, uint32_t hash, void* value, + const Cache::CacheItemHelper* helper, size_t charge, Cache::Handle** handle, Cache::Priority priority) = 0; virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) = 0; + virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash, + const Cache::CacheItemHelper* helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority, bool wait) = 0; + virtual bool Release(Cache::Handle* handle, bool useful, + bool force_erase) = 0; + virtual bool IsReady(Cache::Handle* handle) = 0; + virtual void Wait(Cache::Handle* handle) = 0; virtual bool Ref(Cache::Handle* handle) = 0; - virtual bool Release(Cache::Handle* handle, bool force_erase = false) = 0; + virtual bool Release(Cache::Handle* handle, bool force_erase) = 0; virtual void Erase(const Slice& key, uint32_t hash) = 0; virtual void SetCapacity(size_t capacity) = 0; virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; virtual size_t GetUsage() const = 0; virtual size_t GetPinnedUsage() const = 0; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) = 0; + // Handles iterating over roughly `average_entries_per_lock` entries, using + // `state` to somehow record where it last ended up. Caller initially uses + // *state == 0 and implementation sets *state = UINT32_MAX to indicate + // completion. 
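The ApplyToAllEntries body above rotates among shards, advancing a per-shard cursor until every cursor reaches UINT32_MAX; the ApplyToSomeEntries declaration that the header comment above documents follows right after this sketch. Below is a simplified, self-contained model of that rotation, in which plain vectors stand in for cache shards and no locking is shown.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <vector>

// Toy model (not RocksDB code): each "shard" is a vector of ints, and a state
// of UINT32_MAX marks a shard as finished, as described in the comment above.
void ApplyToSome(const std::vector<int>& shard, uint32_t limit, uint32_t* state,
                 const std::function<void(int)>& cb) {
  uint32_t i = *state;
  const uint32_t end = i + limit;
  for (; i < end && i < shard.size(); ++i) {
    cb(shard[i]);
  }
  *state = (i >= shard.size()) ? UINT32_MAX : i;
}

void ApplyToAll(const std::vector<std::vector<int>>& shards, uint32_t per_pass,
                const std::function<void(int)>& cb) {
  // One cursor per shard, all starting at zero.
  std::unique_ptr<uint32_t[]> states(new uint32_t[shards.size()]{});
  bool remaining_work;
  do {
    remaining_work = false;
    for (size_t s = 0; s < shards.size(); ++s) {
      if (states[s] != UINT32_MAX) {
        ApplyToSome(shards[s], per_pass, &states[s], cb);
        remaining_work |= states[s] != UINT32_MAX;
      }
    }
  } while (remaining_work);
}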
+ virtual void ApplyToSomeEntries( + const std::function& callback, + uint32_t average_entries_per_lock, uint32_t* state) = 0; virtual void EraseUnRefEntries() = 0; virtual std::string GetPrintableOptions() const { return ""; } void set_metadata_charge_policy( @@ -57,22 +73,29 @@ class ShardedCache : public Cache { ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, std::shared_ptr memory_allocator = nullptr); virtual ~ShardedCache() = default; - virtual const char* Name() const override = 0; - virtual CacheShard* GetShard(int shard) = 0; - virtual const CacheShard* GetShard(int shard) const = 0; - virtual void* Value(Handle* handle) override = 0; - virtual size_t GetCharge(Handle* handle) const override = 0; + virtual CacheShard* GetShard(uint32_t shard) = 0; + virtual const CacheShard* GetShard(uint32_t shard) const = 0; virtual uint32_t GetHash(Handle* handle) const = 0; - virtual void DisownData() override = 0; virtual void SetCapacity(size_t capacity) override; virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override; virtual Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle, Priority priority) override; + DeleterFn deleter, Handle** handle, + Priority priority) override; + virtual Status Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t chargge, + Handle** handle = nullptr, + Priority priority = Priority::LOW) override; virtual Handle* Lookup(const Slice& key, Statistics* stats) override; + virtual Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + const CreateCallback& create_cb, Priority priority, + bool wait, Statistics* stats = nullptr) override; + virtual bool Release(Handle* handle, bool useful, + bool force_erase = false) override; + virtual bool IsReady(Handle* handle) override; + virtual void Wait(Handle* handle) override; virtual bool Ref(Handle* handle) override; virtual bool Release(Handle* handle, bool force_erase = false) override; virtual void Erase(const Slice& key) override; @@ -82,24 +105,21 @@ class ShardedCache : public Cache { virtual size_t GetUsage() const override; virtual size_t GetUsage(Handle* handle) const override; virtual size_t GetPinnedUsage() const override; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + virtual void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override; virtual void EraseUnRefEntries() override; virtual std::string GetPrintableOptions() const override; - int GetNumShardBits() const { return num_shard_bits_; } - - private: - static inline uint32_t HashSlice(const Slice& s) { - return static_cast(GetSliceNPHash64(s)); - } + int GetNumShardBits() const; + uint32_t GetNumShards() const; - uint32_t Shard(uint32_t hash) { - // Note, hash >> 32 yields hash in gcc, not the zero we expect! - return (num_shard_bits_ > 0) ? 
(hash >> (32 - num_shard_bits_)) : 0; - } + protected: + inline uint32_t Shard(uint32_t hash) { return hash & shard_mask_; } - int num_shard_bits_; + private: + const uint32_t shard_mask_; mutable port::Mutex capacity_mutex_; size_t capacity_; bool strict_capacity_limit_; diff --git a/cloud/aws/aws_kafka.cc b/cloud/aws/aws_kafka.cc index 74e4a1b5f6f..3c4a9b31e4e 100644 --- a/cloud/aws/aws_kafka.cc +++ b/cloud/aws/aws_kafka.cc @@ -41,6 +41,7 @@ class KafkaWritableFile : public CloudLogWritableFile { } ~KafkaWritableFile() {} + using CloudLogWritableFile::Append; virtual Status Append(const Slice& data); virtual Status Close(); virtual bool IsSyncThreadSafe() const; diff --git a/cloud/aws/aws_kinesis.cc b/cloud/aws/aws_kinesis.cc index 30c93531917..33a724a2666 100644 --- a/cloud/aws/aws_kinesis.cc +++ b/cloud/aws/aws_kinesis.cc @@ -55,6 +55,7 @@ class KinesisWritableFile : public CloudLogWritableFile { } virtual ~KinesisWritableFile() {} + using CloudLogWritableFile::Append; virtual Status Append(const Slice& data) override; virtual Status Close() override; virtual Status LogDelete() override; diff --git a/cloud/cloud_env.cc b/cloud/cloud_env.cc index 594d4c6e18d..633c8728fab 100644 --- a/cloud/cloud_env.cc +++ b/cloud/cloud_env.cc @@ -122,92 +122,92 @@ static std::unordered_map {0, OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto bucket = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto bucket = static_cast(addr); bucket->SetObjectPath(value); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr, std::string* value) { - auto bucket = reinterpret_cast(addr); + const void* addr, std::string* value) { + auto bucket = static_cast(addr); *value = bucket->GetObjectPath(); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr1, const char* addr2, std::string* /*mismatch*/) { - auto bucket1 = reinterpret_cast(addr1); - auto bucket2 = reinterpret_cast(addr2); + const void* addr1, const void* addr2, std::string* /*mismatch*/) { + auto bucket1 = static_cast(addr1); + auto bucket2 = static_cast(addr2); return bucket1->GetObjectPath() == bucket2->GetObjectPath(); }}}, {"region", {0, OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto bucket = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto bucket = static_cast(addr); bucket->SetRegion(value); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr, std::string* value) { - auto bucket = reinterpret_cast(addr); + const void* addr, std::string* value) { + auto bucket = static_cast(addr); *value = bucket->GetRegion(); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr1, const char* addr2, std::string* /*mismatch*/) { - auto bucket1 = reinterpret_cast(addr1); - auto bucket2 = reinterpret_cast(addr2); + const void* addr1, const void* addr2, std::string* /*mismatch*/) { + auto bucket1 = static_cast(addr1); + auto bucket2 = static_cast(addr2); return bucket1->GetRegion() == bucket2->GetRegion(); }}}, {"prefix", {0, OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kNone, [](const ConfigOptions& 
/*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto bucket = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto bucket = static_cast(addr); bucket->SetBucketName(bucket->GetBucketName(false), value); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr, std::string* value) { - auto bucket = reinterpret_cast(addr); + const void* addr, std::string* value) { + auto bucket = static_cast(addr); *value = bucket->GetBucketPrefix(); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr1, const char* addr2, std::string* /*mismatch*/) { - auto bucket1 = reinterpret_cast(addr1); - auto bucket2 = reinterpret_cast(addr2); + const void* addr1, const void* addr2, std::string* /*mismatch*/) { + auto bucket1 = static_cast(addr1); + auto bucket2 = static_cast(addr2); return bucket1->GetBucketPrefix() == bucket2->GetBucketPrefix(); }}}, {"bucket", {0, OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kNone, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto bucket = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto bucket = static_cast(addr); bucket->SetBucketName(value); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr, std::string* value) { - auto bucket = reinterpret_cast(addr); + const void* addr, std::string* value) { + auto bucket = static_cast(addr); *value = bucket->GetBucketName(false); return Status::OK(); }, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr1, const char* addr2, std::string* /*mismatch*/) { - auto bucket1 = reinterpret_cast(addr1); - auto bucket2 = reinterpret_cast(addr2); + const void* addr1, const void* addr2, std::string* /*mismatch*/) { + auto bucket1 = static_cast(addr1); + auto bucket2 = static_cast(addr2); return bucket1->GetBucketName(false) == bucket2->GetBucketName(false); }}}, {"TEST", {0, OptionType::kUnknown, OptionVerificationType::kAlias, OptionTypeFlags::kNone, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto bucket = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto bucket = static_cast(addr); std::string name = value; std::string path; std::string region; @@ -255,9 +255,6 @@ static std::unordered_map {"skip_cloud_children_files", {offset_of(&CloudEnvOptions::skip_cloud_files_in_getchildren), OptionType::kBoolean}}, - {"use_direct_io_for_cloud_download", - {offset_of(&CloudEnvOptions::use_direct_io_for_cloud_download), - OptionType::kBoolean}}, {"constant_sst_file_size_in_manager", {offset_of( &CloudEnvOptions::constant_sst_file_size_in_sst_file_manager), @@ -274,9 +271,9 @@ static std::unordered_map (OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose | OptionTypeFlags::kCompareNever | OptionTypeFlags::kAllowNull), [](const ConfigOptions& opts, const std::string& /*name*/, - const std::string& value, char* addr) { + const std::string& value, void* addr) { auto provider = - reinterpret_cast*>(addr); + static_cast*>(addr); return CloudStorageProvider::CreateFromString(opts, value, provider); }}}, @@ -287,9 +284,9 @@ static std::unordered_map OptionTypeFlags::kCompareNever | OptionTypeFlags::kAllowNull), // Creates a new TableFactory based on value [](const ConfigOptions& opts, const std::string& /*name*/, - const std::string& 
value, char* addr) { + const std::string& value, void* addr) { auto controller = - reinterpret_cast*>(addr); + static_cast*>(addr); Status s = CloudLogController::CreateFromString(opts, value, controller); return s; @@ -306,8 +303,8 @@ static std::unordered_map {0, OptionType::kUnknown, OptionVerificationType::kAlias, OptionTypeFlags::kNone, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto copts = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto copts = static_cast(addr); std::string name; std::string path; std::string region; @@ -359,8 +356,7 @@ Status CloudEnvOptions::Serialize(const ConfigOptions& config_options, std::stri CloudEnv::CloudEnv(const CloudEnvOptions& options, Env* base, const std::shared_ptr& logger) : cloud_env_options(options), base_env_(base), info_log_(logger) { - ConfigurableHelper::RegisterOptions(*this, &cloud_env_options, - &cloud_env_option_type_info); + RegisterOptions(&cloud_env_options, &cloud_env_option_type_info); } CloudEnv::~CloudEnv() { diff --git a/cloud/cloud_env_impl.cc b/cloud/cloud_env_impl.cc index dc47c6f3f83..8efdd4b3d8a 100644 --- a/cloud/cloud_env_impl.cc +++ b/cloud/cloud_env_impl.cc @@ -859,17 +859,15 @@ Status CloudEnvImpl::LoadLocalCloudManifest(const std::string& dbname) { Status CloudEnvImpl::LoadLocalCloudManifest( const std::string& dbname, Env* base_env, std::unique_ptr* cloud_manifest) { - std::unique_ptr file; + std::unique_ptr reader; auto cloud_manifest_file_name = CloudManifestFile(dbname); - auto s = base_env->NewSequentialFile(cloud_manifest_file_name, &file, - EnvOptions()); + auto s = SequentialFileReader::Create(base_env->GetFileSystem(), + cloud_manifest_file_name, FileOptions(), + &reader, nullptr); if (!s.ok()) { return s; } - return CloudManifest::LoadFromLog( - std::unique_ptr(new SequentialFileReader( - NewLegacySequentialFileWrapper(file), cloud_manifest_file_name)), - cloud_manifest); + return CloudManifest::LoadFromLog(std::move(reader), cloud_manifest); } std::string CloudEnvImpl::RemapFilename(const std::string& logical_path) const { @@ -1005,12 +1003,11 @@ Status CloudEnvImpl::writeCloudManifest(CloudManifest* manifest, // Write to tmp file and atomically rename later. This helps if we crash // mid-write :) auto tmp_fname = fname + ".tmp"; - std::unique_ptr file; - Status s = local_env->NewWritableFile(tmp_fname, &file, EnvOptions()); + std::unique_ptr writer; + Status s = WritableFileWriter::Create(local_env->GetFileSystem(), tmp_fname, + FileOptions(), &writer, nullptr); if (s.ok()) { - s = manifest->WriteToLog(std::unique_ptr( - new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(file)), - tmp_fname, EnvOptions()))); + s = manifest->WriteToLog(std::move(writer)); } if (s.ok()) { s = local_env->RenameFile(tmp_fname, fname); @@ -1788,8 +1785,8 @@ Status CloudEnvImpl::RollNewEpoch(const std::string& local_dbname) { // However, we don't move here, we copy. If we moved and crashed immediately // after (before writing CLOUDMANIFEST), we'd corrupt our database. The old // MANIFEST file will be cleaned up in DeleteInvisibleFiles(). 
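The manifest I/O above moves from hand-wrapped Env files to the Create factory helpers on WritableFileWriter and SequentialFileReader. A hedged sketch of that pattern in isolation: the Create calls match the ones used in this diff, but the wrapper function, file name, and the availability of the internal file/ headers are assumptions for illustration only.

#include <memory>
#include <string>

#include "file/sequential_file_reader.h"
#include "file/writable_file_writer.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"

using namespace ROCKSDB_NAMESPACE;

// Hypothetical helper (not in the patch): shows the factory calls used above.
Status OpenManifestLikeFiles(Env* env, const std::string& fname) {
  std::unique_ptr<WritableFileWriter> writer;
  Status s = WritableFileWriter::Create(env->GetFileSystem(), fname,
                                        FileOptions(), &writer, nullptr);
  if (!s.ok()) {
    return s;
  }
  // ... write the log/manifest contents through `writer` ...

  std::unique_ptr<SequentialFileReader> reader;
  s = SequentialFileReader::Create(env->GetFileSystem(), fname, FileOptions(),
                                   &reader, nullptr);
  // ... replay the log/manifest through `reader` ...
  return s;
}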
- LegacyFileSystemWrapper fs(GetBaseEnv()); - st = CopyFile(&fs, ManifestFileWithEpoch(local_dbname, oldEpoch), + const auto& fs = GetBaseEnv()->GetFileSystem(); + st = CopyFile(fs.get(), ManifestFileWithEpoch(local_dbname, oldEpoch), ManifestFileWithEpoch(local_dbname, newEpoch), 0, true); if (!st.ok()) { return st; diff --git a/cloud/cloud_manifest_test.cc b/cloud/cloud_manifest_test.cc index 177064371c4..b38936a0f2f 100644 --- a/cloud/cloud_manifest_test.cc +++ b/cloud/cloud_manifest_test.cc @@ -50,22 +50,18 @@ TEST_F(CloudManifestTest, BasicTest) { // serialize and deserialize auto tmpfile = tmp_dir_ + "/cloudmanifest"; { - std::unique_ptr file; - ASSERT_OK(env_->NewWritableFile(tmpfile, &file, EnvOptions())); - ASSERT_OK(manifest->WriteToLog( - std::unique_ptr(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), tmpfile, - EnvOptions())))); + std::unique_ptr writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), tmpfile, + FileOptions(), &writer, nullptr)); + ASSERT_OK(manifest->WriteToLog(std::move(writer))); } manifest.reset(); { - std::unique_ptr file; - ASSERT_OK(env_->NewSequentialFile(tmpfile, &file, EnvOptions())); - CloudManifest::LoadFromLog( - std::unique_ptr(new SequentialFileReader( - NewLegacySequentialFileWrapper(file), tmpfile)), - &manifest); + std::unique_ptr reader; + ASSERT_OK(SequentialFileReader::Create( + env_->GetFileSystem(), tmpfile, FileOptions(), &reader, nullptr)); + ASSERT_OK(CloudManifest::LoadFromLog(std::move(reader), &manifest)); } } } diff --git a/cloud/cloud_storage_provider_impl.h b/cloud/cloud_storage_provider_impl.h index 60f0974ca48..63c8587d6f2 100644 --- a/cloud/cloud_storage_provider_impl.h +++ b/cloud/cloud_storage_provider_impl.h @@ -51,12 +51,14 @@ class CloudStorageWritableFileImpl : public CloudStorageWritableFile { const EnvOptions& options); virtual ~CloudStorageWritableFileImpl(); + using CloudStorageWritableFile::Append; virtual Status Append(const Slice& data) override { assert(status_.ok()); // write to temporary file return local_file_->Append(data); } + using CloudStorageWritableFile::PositionedAppend; Status PositionedAppend(const Slice& data, uint64_t offset) override { return local_file_->PositionedAppend(data, offset); } diff --git a/cloud/db_cloud_impl.cc b/cloud/db_cloud_impl.cc index 6fd7afef933..c91938a16d8 100644 --- a/cloud/db_cloud_impl.cc +++ b/cloud/db_cloud_impl.cc @@ -29,21 +29,22 @@ namespace { */ class ConstantSizeSstFileManager : public SstFileManagerImpl { public: - ConstantSizeSstFileManager(int64_t constant_file_size, Env* env, + ConstantSizeSstFileManager(int64_t constant_file_size, + const std::shared_ptr& clock, + const std::shared_ptr& fs, std::shared_ptr logger, int64_t rate_bytes_per_sec, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) - : SstFileManagerImpl(env, std::make_shared(env), - std::move(logger), rate_bytes_per_sec, + : SstFileManagerImpl(clock, fs, std::move(logger), rate_bytes_per_sec, max_trash_db_ratio, bytes_max_delete_chunk), constant_file_size_(constant_file_size) { assert(constant_file_size_ >= 0); } - Status OnAddFile(const std::string& file_path, bool compaction) override { + Status OnAddFile(const std::string& file_path) override { return SstFileManagerImpl::OnAddFile( - file_path, uint64_t(constant_file_size_), compaction); + file_path, uint64_t(constant_file_size_)); } private: @@ -107,7 +108,8 @@ Status DBCloud::Open(const Options& opt, const std::string& local_dbname, // If users don't use Options.sst_file_manager, then these values 
are used // currently when creating an SST File Manager. options.sst_file_manager = std::make_shared( - constant_sst_file_size, options.env, options.info_log, + constant_sst_file_size, options.env->GetSystemClock(), + options.env->GetFileSystem(), options.info_log, 0 /* rate_bytes_per_sec */, 0.25 /* max_trash_db_ratio */, 64 * 1024 * 1024 /* bytes_max_delete_chunk */); } @@ -318,9 +320,9 @@ Status DBCloudImpl::DoCheckpointToCloud( auto current_epoch = cenv->GetCloudManifest()->GetCurrentEpoch().ToString(); auto manifest_fname = ManifestFileWithEpoch("", current_epoch); auto tmp_manifest_fname = manifest_fname + ".tmp"; - LegacyFileSystemWrapper fs(base_env); + auto fs = base_env->GetFileSystem(); st = - CopyFile(&fs, GetName() + "/" + manifest_fname, + CopyFile(fs.get(), GetName() + "/" + manifest_fname, GetName() + "/" + tmp_manifest_fname, manifest_file_size, false); if (!st.ok()) { return st; diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc index a1a1be3a264..520588afe89 100644 --- a/db/arena_wrapped_db_iter.cc +++ b/db/arena_wrapped_db_iter.cc @@ -30,20 +30,18 @@ Status ArenaWrappedDBIter::GetProperty(std::string prop_name, return db_iter_->GetProperty(prop_name, prop); } -void ArenaWrappedDBIter::Init(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iteration, - uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob, - bool allow_refresh) { +void ArenaWrappedDBIter::Init( + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, + uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) { auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options, - cf_options.user_comparator, nullptr, sequence, - true, max_sequential_skip_in_iteration, - read_callback, db_impl, cfd, allow_blob); + db_iter_ = + new (mem) DBIter(env, read_options, ioptions, mutable_cf_options, + ioptions.user_comparator, /* iter */ nullptr, version, + sequence, true, max_sequential_skip_in_iteration, + read_callback, db_impl, cfd, expose_blob_index); sv_number_ = version_number; read_options_ = read_options; allow_refresh_ = allow_refresh; @@ -72,8 +70,9 @@ Status ArenaWrappedDBIter::Refresh() { read_callback_->Refresh(latest_seq); } Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, - latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, - cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_, + sv->current, latest_seq, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_, allow_refresh_); InternalIterator* internal_iter = db_impl_->NewInternalIterator( @@ -88,18 +87,17 @@ Status ArenaWrappedDBIter::Refresh() { } ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - 
bool allow_blob, bool allow_refresh) { + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); - iter->Init(env, read_options, cf_options, mutable_cf_options, sequence, + iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence, max_sequential_skip_in_iterations, version_number, read_callback, - db_impl, cfd, allow_blob, allow_refresh); + db_impl, cfd, expose_blob_index, allow_refresh); if (db_impl != nullptr && cfd != nullptr && allow_refresh) { - iter->StoreRefreshInfo(db_impl, cfd, read_callback, allow_blob); + iter->StoreRefreshInfo(db_impl, cfd, read_callback, expose_blob_index); } return iter; diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h index 80422f63a02..17273b201d9 100644 --- a/db/arena_wrapped_db_iter.h +++ b/db/arena_wrapped_db_iter.h @@ -23,6 +23,7 @@ namespace ROCKSDB_NAMESPACE { class Arena; +class Version; // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed to be allocated. This class is used as an entry point of @@ -71,21 +72,21 @@ class ArenaWrappedDBIter : public Iterator { Status Refresh() override; void Init(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, + const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob, bool allow_refresh); + bool expose_blob_index, bool allow_refresh); // Store some parameters so we can refresh the iterator at a later point // with these same params void StoreRefreshInfo(DBImpl* db_impl, ColumnFamilyData* cfd, - ReadCallback* read_callback, bool allow_blob) { + ReadCallback* read_callback, bool expose_blob_index) { db_impl_ = db_impl; cfd_ = cfd; read_callback_ = read_callback; - allow_blob_ = allow_blob; + expose_blob_index_ = expose_blob_index; } private: @@ -96,7 +97,7 @@ class ArenaWrappedDBIter : public Iterator { DBImpl* db_impl_ = nullptr; ReadOptions read_options_; ReadCallback* read_callback_; - bool allow_blob_ = false; + bool expose_blob_index_ = false; bool allow_refresh_ = true; }; @@ -104,11 +105,10 @@ class ArenaWrappedDBIter : public Iterator { // `db_impl` and `cfd` are used for reneweal. If left null, renewal will not // be supported. 
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - ReadCallback* read_callback, DBImpl* db_impl = nullptr, - ColumnFamilyData* cfd = nullptr, bool allow_blob = false, - bool allow_refresh = true); + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const Version* version, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + uint64_t version_number, ReadCallback* read_callback, + DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, + bool expose_blob_index = false, bool allow_refresh = true); } // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_fetcher.cc b/db/blob/blob_fetcher.cc new file mode 100644 index 00000000000..a42a4be5f39 --- /dev/null +++ b/db/blob/blob_fetcher.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_fetcher.h" + +#include "db/version_set.h" + +namespace ROCKSDB_NAMESPACE { + +Status BlobFetcher::FetchBlob(const Slice& user_key, const Slice& blob_index, + PinnableSlice* blob_value) { + Status s; + assert(version_); + constexpr uint64_t* bytes_read = nullptr; + s = version_->GetBlob(read_options_, user_key, blob_index, blob_value, + bytes_read); + return s; +} + +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/blob/blob_fetcher.h b/db/blob/blob_fetcher.h new file mode 100644 index 00000000000..747057f0999 --- /dev/null +++ b/db/blob/blob_fetcher.h @@ -0,0 +1,26 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +class Version; + +class BlobFetcher { + public: + BlobFetcher(Version* version, const ReadOptions& read_options) + : version_(version), read_options_(read_options) {} + + Status FetchBlob(const Slice& user_key, const Slice& blob_index, + PinnableSlice* blob_value); + + private: + Version* version_; + ReadOptions read_options_; +}; +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc index 57f05438c47..674466c7128 100644 --- a/db/blob/blob_file_builder.cc +++ b/db/blob/blob_file_builder.cc @@ -8,6 +8,7 @@ #include #include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_completion_callback.h" #include "db/blob/blob_index.h" #include "db/blob/blob_log_format.h" #include "db/blob/blob_log_writer.h" @@ -17,41 +18,46 @@ #include "file/writable_file_writer.h" #include "logging/logging.h" #include "options/cf_options.h" +#include "options/options_helper.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" #include "util/compression.h" namespace ROCKSDB_NAMESPACE { BlobFileBuilder::BlobFileBuilder( - VersionSet* versions, Env* env, FileSystem* fs, - const ImmutableCFOptions* immutable_cf_options, + VersionSet* versions, FileSystem* fs, + const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, int job_id, uint32_t column_family_id, const std::string& column_family_name, Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, std::vector* blob_file_paths, std::vector* blob_file_additions) - : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, env, - fs, immutable_cf_options, mutable_cf_options, - file_options, job_id, column_family_id, - column_family_name, io_priority, write_hint, - blob_file_paths, blob_file_additions) {} + : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs, + immutable_options, mutable_cf_options, file_options, + job_id, column_family_id, column_family_name, io_priority, + write_hint, io_tracer, blob_callback, blob_file_paths, + blob_file_additions) {} BlobFileBuilder::BlobFileBuilder( - std::function file_number_generator, Env* env, FileSystem* fs, - const ImmutableCFOptions* immutable_cf_options, + std::function file_number_generator, FileSystem* fs, + const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, int job_id, uint32_t column_family_id, const std::string& column_family_name, Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, std::vector* blob_file_paths, std::vector* blob_file_additions) : file_number_generator_(std::move(file_number_generator)), - env_(env), fs_(fs), - immutable_cf_options_(immutable_cf_options), + immutable_options_(immutable_options), min_blob_size_(mutable_cf_options->min_blob_size), blob_file_size_(mutable_cf_options->blob_file_size), blob_compression_type_(mutable_cf_options->blob_compression_type), @@ -61,14 +67,15 @@ BlobFileBuilder::BlobFileBuilder( column_family_name_(column_family_name), io_priority_(io_priority), write_hint_(write_hint), + io_tracer_(io_tracer), + blob_callback_(blob_callback), 
blob_file_paths_(blob_file_paths), blob_file_additions_(blob_file_additions), blob_count_(0), blob_bytes_(0) { assert(file_number_generator_); - assert(env_); assert(fs_); - assert(immutable_cf_options_); + assert(immutable_options_); assert(file_options_); assert(blob_file_paths_); assert(blob_file_paths_->empty()); @@ -149,19 +156,20 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { assert(file_number_generator_); const uint64_t blob_file_number = file_number_generator_(); - assert(immutable_cf_options_); - assert(!immutable_cf_options_->cf_paths.empty()); - std::string blob_file_path = BlobFileName( - immutable_cf_options_->cf_paths.front().path, blob_file_number); + assert(immutable_options_); + assert(!immutable_options_->cf_paths.empty()); + std::string blob_file_path = + BlobFileName(immutable_options_->cf_paths.front().path, blob_file_number); std::unique_ptr file; { - TEST_SYNC_POINT("BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile"); - assert(file_options_); - const Status s = - NewWritableFile(fs_, blob_file_path, &file, *file_options_); + Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_); + + TEST_SYNC_POINT_CALLBACK( + "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s); + if (!s.ok()) { return s; } @@ -176,17 +184,20 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { assert(file); file->SetIOPriority(io_priority_); file->SetWriteLifeTimeHint(write_hint_); - - Statistics* const statistics = immutable_cf_options_->statistics; - + FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types; + Statistics* const statistics = immutable_options_->stats; std::unique_ptr file_writer(new WritableFileWriter( - std::move(file), blob_file_paths_->back(), *file_options_, env_, - nullptr /*IOTracer*/, statistics, immutable_cf_options_->listeners, - immutable_cf_options_->file_checksum_gen_factory)); + std::move(file), blob_file_paths_->back(), *file_options_, + immutable_options_->clock, io_tracer_, statistics, + immutable_options_->listeners, + immutable_options_->file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kBlobFile))); - std::unique_ptr blob_log_writer( - new BlobLogWriter(std::move(file_writer), env_, statistics, - blob_file_number, immutable_cf_options_->use_fsync)); + constexpr bool do_flush = false; + + std::unique_ptr blob_log_writer(new BlobLogWriter( + std::move(file_writer), immutable_options_->clock, statistics, + blob_file_number, immutable_options_->use_fsync, do_flush)); constexpr bool has_ttl = false; constexpr ExpirationRange expiration_range; @@ -195,9 +206,11 @@ Status BlobFileBuilder::OpenBlobFileIfNeeded() { expiration_range); { - TEST_SYNC_POINT("BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader"); + Status s = blob_log_writer->WriteHeader(header); + + TEST_SYNC_POINT_CALLBACK( + "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s); - const Status s = blob_log_writer->WriteHeader(header); if (!s.ok()) { return s; } @@ -247,9 +260,10 @@ Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob, uint64_t key_offset = 0; - TEST_SYNC_POINT("BlobFileBuilder::WriteBlobToFile:AddRecord"); + Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset); + + TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s); - const Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset); if (!s.ok()) { return s; } @@ -271,10 +285,10 @@ Status BlobFileBuilder::CloseBlobFile() { std::string checksum_method; std::string checksum_value; - 
TEST_SYNC_POINT("BlobFileBuilder::WriteBlobToFile:AppendFooter"); + Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value); + + TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s); - const Status s = - writer_->AppendFooter(footer, &checksum_method, &checksum_value); if (!s.ok()) { return s; } @@ -286,18 +300,21 @@ Status BlobFileBuilder::CloseBlobFile() { std::move(checksum_method), std::move(checksum_value)); - assert(immutable_cf_options_); - ROCKS_LOG_INFO(immutable_cf_options_->info_log, + assert(immutable_options_); + ROCKS_LOG_INFO(immutable_options_->logger, "[%s] [JOB %d] Generated blob file #%" PRIu64 ": %" PRIu64 " total blobs, %" PRIu64 " total bytes", column_family_name_.c_str(), job_id_, blob_file_number, blob_count_, blob_bytes_); + if (blob_callback_) { + s = blob_callback_->OnBlobFileCompleted(blob_file_paths_->back()); + } writer_.reset(); blob_count_ = 0; blob_bytes_ = 0; - return Status::OK(); + return s; } Status BlobFileBuilder::CloseBlobFileIfNeeded() { @@ -313,4 +330,20 @@ Status BlobFileBuilder::CloseBlobFileIfNeeded() { return CloseBlobFile(); } +void BlobFileBuilder::Abandon() { + if (!IsBlobFileOpen()) { + return; + } + + if (blob_callback_) { + // BlobFileBuilder::Abandon() is called because of error while writing to + // Blob files. So we can ignore the below error. + blob_callback_->OnBlobFileCompleted(blob_file_paths_->back()) + .PermitUncheckedError(); + } + + writer_.reset(); + blob_count_ = 0; + blob_bytes_ = 0; +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_file_builder.h b/db/blob/blob_file_builder.h index 755ab435094..0929b6a7758 100644 --- a/db/blob/blob_file_builder.h +++ b/db/blob/blob_file_builder.h @@ -18,36 +18,42 @@ namespace ROCKSDB_NAMESPACE { class VersionSet; class FileSystem; -struct ImmutableCFOptions; +class SystemClock; +struct ImmutableOptions; struct MutableCFOptions; struct FileOptions; class BlobFileAddition; class Status; class Slice; class BlobLogWriter; +class IOTracer; +class BlobFileCompletionCallback; class BlobFileBuilder { public: - BlobFileBuilder(VersionSet* versions, Env* env, FileSystem* fs, - const ImmutableCFOptions* immutable_cf_options, + BlobFileBuilder(VersionSet* versions, FileSystem* fs, + const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, int job_id, uint32_t column_family_id, const std::string& column_family_name, Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, std::vector* blob_file_paths, std::vector* blob_file_additions); - BlobFileBuilder(std::function file_number_generator, Env* env, - FileSystem* fs, - const ImmutableCFOptions* immutable_cf_options, + BlobFileBuilder(std::function file_number_generator, + FileSystem* fs, const ImmutableOptions* immutable_options, const MutableCFOptions* mutable_cf_options, const FileOptions* file_options, int job_id, uint32_t column_family_id, const std::string& column_family_name, Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint, + const std::shared_ptr& io_tracer, + BlobFileCompletionCallback* blob_callback, std::vector* blob_file_paths, std::vector* blob_file_additions); @@ -58,6 +64,7 @@ class BlobFileBuilder { Status Add(const Slice& key, const Slice& value, std::string* blob_index); Status Finish(); + void Abandon(); private: bool IsBlobFileOpen() const; @@ -69,9 +76,8 @@ class BlobFileBuilder { Status CloseBlobFileIfNeeded(); 
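The new Abandon() above complements Add() and Finish() on BlobFileBuilder. Here is a call-sequence sketch of how a caller might drive the builder; the wrapper function is hypothetical, and construction is elided because it requires the VersionSet/FileSystem/options plumbing shown in the constructors above.

#include <string>
#include <utility>
#include <vector>

#include "db/blob/blob_file_builder.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"

using namespace ROCKSDB_NAMESPACE;

// Hypothetical wrapper (not in the patch): drives an already-constructed
// builder, abandoning the partially written blob file on any error.
Status WriteBlobs(BlobFileBuilder* builder,
                  const std::vector<std::pair<Slice, Slice>>& kvs) {
  for (const auto& kv : kvs) {
    std::string blob_index;  // filled in only if the value went to a blob file
    const Status s = builder->Add(kv.first, kv.second, &blob_index);
    if (!s.ok()) {
      builder->Abandon();  // drop the partially written blob file
      return s;
    }
  }
  // Finish() closes the current blob file (if any) and records its metadata.
  return builder->Finish();
}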
std::function file_number_generator_; - Env* env_; FileSystem* fs_; - const ImmutableCFOptions* immutable_cf_options_; + const ImmutableOptions* immutable_options_; uint64_t min_blob_size_; uint64_t blob_file_size_; CompressionType blob_compression_type_; @@ -81,6 +87,8 @@ class BlobFileBuilder { std::string column_family_name_; Env::IOPriority io_priority_; Env::WriteLifeTimeHint write_hint_; + std::shared_ptr io_tracer_; + BlobFileCompletionCallback* blob_callback_; std::vector* blob_file_paths_; std::vector* blob_file_additions_; std::unique_ptr writer_; diff --git a/db/blob/blob_file_builder_test.cc b/db/blob/blob_file_builder_test.cc index 72e9ac47b98..08cfac00754 100644 --- a/db/blob/blob_file_builder_test.cc +++ b/db/blob/blob_file_builder_test.cc @@ -15,7 +15,6 @@ #include "db/blob/blob_index.h" #include "db/blob/blob_log_format.h" #include "db/blob/blob_log_sequential_reader.h" -#include "env/composite_env_wrapper.h" #include "env/mock_env.h" #include "file/filename.h" #include "file/random_access_file_reader.h" @@ -40,7 +39,10 @@ class TestFileNumberGenerator { class BlobFileBuilderTest : public testing::Test { protected: - BlobFileBuilderTest() : mock_env_(Env::Default()), fs_(&mock_env_) {} + BlobFileBuilderTest() : mock_env_(Env::Default()) { + fs_ = mock_env_.GetFileSystem().get(); + clock_ = mock_env_.GetSystemClock().get(); + } void VerifyBlobFile(uint64_t blob_file_number, const std::string& blob_file_path, @@ -54,14 +56,13 @@ class BlobFileBuilderTest : public testing::Test { std::unique_ptr file; constexpr IODebugContext* dbg = nullptr; ASSERT_OK( - fs_.NewRandomAccessFile(blob_file_path, file_options_, &file, dbg)); + fs_->NewRandomAccessFile(blob_file_path, file_options_, &file, dbg)); std::unique_ptr file_reader( - new RandomAccessFileReader(std::move(file), blob_file_path, - &mock_env_)); + new RandomAccessFileReader(std::move(file), blob_file_path, clock_)); constexpr Statistics* statistics = nullptr; - BlobLogSequentialReader blob_log_reader(std::move(file_reader), &mock_env_, + BlobLogSequentialReader blob_log_reader(std::move(file_reader), clock_, statistics); BlobLogHeader header; @@ -108,7 +109,8 @@ class BlobFileBuilderTest : public testing::Test { } MockEnv mock_env_; - LegacyFileSystemWrapper fs_; + FileSystem* fs_; + SystemClock* clock_; FileOptions file_options_; }; @@ -125,8 +127,9 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { "BlobFileBuilderTest_BuildAndCheckOneFile"), 0); options.enable_blob_files = true; + options.env = &mock_env_; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -138,11 +141,11 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &mock_env_, &fs_, - &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); std::vector> expected_key_value_pairs( number_of_blobs); @@ -174,9 +177,9 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) { const std::string& blob_file_path = 
blob_file_paths[0]; - ASSERT_EQ(blob_file_path, - BlobFileName(immutable_cf_options.cf_paths.front().path, - blob_file_number)); + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); ASSERT_EQ(blob_file_additions.size(), 1); @@ -208,8 +211,9 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) { 0); options.enable_blob_files = true; options.blob_file_size = value_size; + options.env = &mock_env_; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -221,11 +225,11 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &mock_env_, &fs_, - &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); std::vector> expected_key_value_pairs( number_of_blobs); @@ -258,7 +262,7 @@ TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) { const uint64_t blob_file_number = i + 2; ASSERT_EQ(blob_file_paths[i], - BlobFileName(immutable_cf_options.cf_paths.front().path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); const auto& blob_file_addition = blob_file_additions[i]; @@ -293,8 +297,9 @@ TEST_F(BlobFileBuilderTest, InlinedValues) { 0); options.enable_blob_files = true; options.min_blob_size = 1024; + options.env = &mock_env_; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -306,11 +311,11 @@ TEST_F(BlobFileBuilderTest, InlinedValues) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &mock_env_, &fs_, - &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); for (size_t i = 0; i < number_of_blobs; ++i) { const std::string key = std::to_string(i); @@ -345,8 +350,9 @@ TEST_F(BlobFileBuilderTest, Compression) { test::PerThreadDBPath(&mock_env_, "BlobFileBuilderTest_Compression"), 0); options.enable_blob_files = true; options.blob_compression_type = kSnappyCompression; + options.env = &mock_env_; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -358,11 +364,11 @@ TEST_F(BlobFileBuilderTest, Compression) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &mock_env_, &fs_, - &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, 
io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); const std::string key("1"); const std::string uncompressed_value(value_size, 'x'); @@ -381,9 +387,9 @@ TEST_F(BlobFileBuilderTest, Compression) { const std::string& blob_file_path = blob_file_paths[0]; - ASSERT_EQ(blob_file_path, - BlobFileName(immutable_cf_options.cf_paths.front().path, - blob_file_number)); + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); ASSERT_EQ(blob_file_additions.size(), 1); @@ -427,8 +433,8 @@ TEST_F(BlobFileBuilderTest, CompressionError) { 0); options.enable_blob_files = true; options.blob_compression_type = kSnappyCompression; - - ImmutableCFOptions immutable_cf_options(options); + options.env = &mock_env_; + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -440,11 +446,11 @@ TEST_F(BlobFileBuilderTest, CompressionError) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &mock_env_, &fs_, - &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue", [](void* arg) { @@ -466,9 +472,9 @@ TEST_F(BlobFileBuilderTest, CompressionError) { constexpr uint64_t blob_file_number = 2; ASSERT_EQ(blob_file_paths.size(), 1); - ASSERT_EQ(blob_file_paths[0], - BlobFileName(immutable_cf_options.cf_paths.front().path, - blob_file_number)); + ASSERT_EQ( + blob_file_paths[0], + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); ASSERT_TRUE(blob_file_additions.empty()); } @@ -504,8 +510,9 @@ TEST_F(BlobFileBuilderTest, Checksum) { options.enable_blob_files = true; options.file_checksum_gen_factory = std::make_shared(); + options.env = &mock_env_; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -517,11 +524,11 @@ TEST_F(BlobFileBuilderTest, Checksum) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &mock_env_, &fs_, - &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); const std::string key("1"); const std::string value("deadbeef"); @@ -540,9 +547,9 @@ TEST_F(BlobFileBuilderTest, Checksum) { const std::string& blob_file_path 
= blob_file_paths[0]; - ASSERT_EQ(blob_file_path, - BlobFileName(immutable_cf_options.cf_paths.front().path, - blob_file_number)); + ASSERT_EQ( + blob_file_path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); ASSERT_EQ(blob_file_additions.size(), 1); @@ -570,13 +577,11 @@ class BlobFileBuilderIOErrorTest protected: BlobFileBuilderIOErrorTest() : mock_env_(Env::Default()), - fault_injection_env_(&mock_env_), - fs_(&fault_injection_env_), + fs_(mock_env_.GetFileSystem().get()), sync_point_(GetParam()) {} MockEnv mock_env_; - FaultInjectionTestEnv fault_injection_env_; - LegacyFileSystemWrapper fs_; + FileSystem* fs_; FileOptions file_options_; std::string sync_point_; }; @@ -597,13 +602,13 @@ TEST_P(BlobFileBuilderIOErrorTest, IOError) { Options options; options.cf_paths.emplace_back( - test::PerThreadDBPath(&fault_injection_env_, - "BlobFileBuilderIOErrorTest_IOError"), + test::PerThreadDBPath(&mock_env_, "BlobFileBuilderIOErrorTest_IOError"), 0); options.enable_blob_files = true; options.blob_file_size = value_size; + options.env = &mock_env_; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); MutableCFOptions mutable_cf_options(options); constexpr int job_id = 1; @@ -615,15 +620,17 @@ TEST_P(BlobFileBuilderIOErrorTest, IOError) { std::vector blob_file_paths; std::vector blob_file_additions; - BlobFileBuilder builder(TestFileNumberGenerator(), &fault_injection_env_, - &fs_, &immutable_cf_options, &mutable_cf_options, - &file_options_, job_id, column_family_id, - column_family_name, io_priority, write_hint, - &blob_file_paths, &blob_file_additions); + BlobFileBuilder builder( + TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options, + &file_options_, job_id, column_family_id, column_family_name, io_priority, + write_hint, nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/, + &blob_file_paths, &blob_file_additions); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); - SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { - fault_injection_env_.SetFilesystemActive(false, - Status::IOError(sync_point_)); + (*s) = Status::IOError(sync_point_); }); SyncPoint::GetInstance()->EnableProcessing(); @@ -644,7 +651,7 @@ TEST_P(BlobFileBuilderIOErrorTest, IOError) { ASSERT_EQ(blob_file_paths.size(), 1); ASSERT_EQ(blob_file_paths[0], - BlobFileName(immutable_cf_options.cf_paths.front().path, + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number)); } diff --git a/db/blob/blob_file_cache.cc b/db/blob/blob_file_cache.cc new file mode 100644 index 00000000000..1a6cdf6880c --- /dev/null +++ b/db/blob/blob_file_cache.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/blob/blob_file_cache.h" + +#include +#include + +#include "db/blob/blob_file_reader.h" +#include "options/cf_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "test_util/sync_point.h" +#include "trace_replay/io_tracer.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +BlobFileCache::BlobFileCache(Cache* cache, + const ImmutableOptions* immutable_options, + const FileOptions* file_options, + uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr& io_tracer) + : cache_(cache), + mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr), + immutable_options_(immutable_options), + file_options_(file_options), + column_family_id_(column_family_id), + blob_file_read_hist_(blob_file_read_hist), + io_tracer_(io_tracer) { + assert(cache_); + assert(immutable_options_); + assert(file_options_); +} + +Status BlobFileCache::GetBlobFileReader( + uint64_t blob_file_number, + CacheHandleGuard* blob_file_reader) { + assert(blob_file_reader); + assert(blob_file_reader->IsEmpty()); + + const Slice key = GetSlice(&blob_file_number); + + assert(cache_); + + Cache::Handle* handle = cache_->Lookup(key); + if (handle) { + *blob_file_reader = CacheHandleGuard(cache_, handle); + return Status::OK(); + } + + TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck"); + + // Check again while holding mutex + MutexLock lock(mutex_.get(key)); + + handle = cache_->Lookup(key); + if (handle) { + *blob_file_reader = CacheHandleGuard(cache_, handle); + return Status::OK(); + } + + assert(immutable_options_); + Statistics* const statistics = immutable_options_->stats; + + RecordTick(statistics, NO_FILE_OPENS); + + std::unique_ptr reader; + + { + assert(file_options_); + const Status s = BlobFileReader::Create( + *immutable_options_, *file_options_, column_family_id_, + blob_file_read_hist_, blob_file_number, io_tracer_, &reader); + if (!s.ok()) { + RecordTick(statistics, NO_FILE_ERRORS); + return s; + } + } + + { + constexpr size_t charge = 1; + + const Status s = cache_->Insert(key, reader.get(), charge, + &DeleteCacheEntry, &handle); + if (!s.ok()) { + RecordTick(statistics, NO_FILE_ERRORS); + return s; + } + } + + reader.release(); + + *blob_file_reader = CacheHandleGuard(cache_, handle); + + return Status::OK(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_file_cache.h b/db/blob/blob_file_cache.h new file mode 100644 index 00000000000..8eec05f184e --- /dev/null +++ b/db/blob/blob_file_cache.h @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#include <cinttypes> + +#include "cache/cache_helpers.h" +#include "rocksdb/rocksdb_namespace.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +class Cache; +struct ImmutableOptions; +struct FileOptions; +class HistogramImpl; +class Status; +class BlobFileReader; +class Slice; +class IOTracer; + +class BlobFileCache { + public: + BlobFileCache(Cache* cache, const ImmutableOptions* immutable_options, + const FileOptions* file_options, uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, + const std::shared_ptr<IOTracer>& io_tracer); + + BlobFileCache(const BlobFileCache&) = delete; + BlobFileCache& operator=(const BlobFileCache&) = delete; + + Status GetBlobFileReader(uint64_t blob_file_number, + CacheHandleGuard<BlobFileReader>* blob_file_reader); + + private: + Cache* cache_; + // Note: mutex_ below is used to guard against multiple threads racing to open + // the same file. + Striped<port::Mutex, Slice> mutex_; + const ImmutableOptions* immutable_options_; + const FileOptions* file_options_; + uint32_t column_family_id_; + HistogramImpl* blob_file_read_hist_; + std::shared_ptr<IOTracer> io_tracer_; + + static constexpr size_t kNumberOfMutexStripes = 1 << 7; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_file_cache_test.cc b/db/blob/blob_file_cache_test.cc new file mode 100644 index 00000000000..bef2d6202e7 --- /dev/null +++ b/db/blob/blob_file_cache_test.cc @@ -0,0 +1,267 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_file_cache.h" + +#include <cassert> +#include <string> + +#include "db/blob/blob_log_format.h" +#include "db/blob/blob_log_writer.h" +#include "env/mock_env.h" +#include "file/filename.h" +#include "file/read_write_util.h" +#include "file/writable_file_writer.h" +#include "options/cf_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +// Creates a test blob file with a single blob in it.
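+// The helper goes through the same BlobLogWriter used by the production write
+// path: it emits a blob log header, a single key/blob record and a footer, so
+// the resulting file can subsequently be opened by BlobFileReader via the
+// cache under test.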
+void WriteBlobFile(uint32_t column_family_id, + const ImmutableOptions& immutable_options, + uint64_t blob_file_number) { + assert(!immutable_options.cf_paths.empty()); + + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); + + std::unique_ptr file; + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, + FileOptions())); + + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); + + constexpr Statistics* statistics = nullptr; + constexpr bool use_fsync = false; + constexpr bool do_flush = false; + + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); + + constexpr bool has_ttl = false; + constexpr ExpirationRange expiration_range; + + BlobLogHeader header(column_family_id, kNoCompression, has_ttl, + expiration_range); + + ASSERT_OK(blob_log_writer.WriteHeader(header)); + + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + + std::string compressed_blob; + + uint64_t key_offset = 0; + uint64_t blob_offset = 0; + + ASSERT_OK(blob_log_writer.AddRecord(key, blob, &key_offset, &blob_offset)); + + BlobLogFooter footer; + footer.blob_count = 1; + footer.expiration_range = expiration_range; + + std::string checksum_method; + std::string checksum_value; + + ASSERT_OK( + blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value)); +} + +} // anonymous namespace + +class BlobFileCacheTest : public testing::Test { + protected: + BlobFileCacheTest() : mock_env_(Env::Default()) {} + + MockEnv mock_env_; +}; + +TEST_F(BlobFileCacheTest, GetBlobFileReader) { + Options options; + options.env = &mock_env_; + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(&mock_env_, "BlobFileCacheTest_GetBlobFileReader"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // First try: reader should be opened and put in cache + CacheHandleGuard first; + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_NE(first.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + // Second try: reader should be served from cache + CacheHandleGuard second; + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_NE(second.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + ASSERT_EQ(first.GetValue(), second.GetValue()); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) { + Options options; + options.env = &mock_env_; + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(&mock_env_, + "BlobFileCacheTest_GetBlobFileReader_Race"), + 0); + options.enable_blob_files = 
true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + CacheHandleGuard first; + CacheHandleGuard second; + + SyncPoint::GetInstance()->SetCallBack( + "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) { + // Disabling sync points to prevent infinite recursion + SyncPoint::GetInstance()->DisableProcessing(); + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_NE(second.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_NE(first.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); + + ASSERT_EQ(first.GetValue(), second.GetValue()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) { + Options options; + options.env = &mock_env_; + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(&mock_env_, + "BlobFileCacheTest_GetBlobFileReader_IOError"), + 0); + options.enable_blob_files = true; + + constexpr size_t capacity = 10; + std::shared_ptr backing_cache = NewLRUCache(capacity); + + ImmutableOptions immutable_options(options); + FileOptions file_options; + constexpr uint32_t column_family_id = 1; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // Note: there is no blob file with the below number + constexpr uint64_t blob_file_number = 123; + + CacheHandleGuard reader; + + ASSERT_TRUE( + blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError()); + ASSERT_EQ(reader.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); +} + +TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) { + Options options; + options.env = &mock_env_; + options.statistics = CreateDBStatistics(); + options.cf_paths.emplace_back( + test::PerThreadDBPath(&mock_env_, + "BlobFileCacheTest_GetBlobFileReader_CacheFull"), + 0); + options.enable_blob_files = true; + + constexpr uint32_t column_family_id = 1; + ImmutableOptions immutable_options(options); + constexpr uint64_t blob_file_number = 123; + + WriteBlobFile(column_family_id, immutable_options, blob_file_number); + + constexpr size_t capacity = 0; + constexpr int num_shard_bits = -1; // determined automatically + constexpr bool strict_capacity_limit = true; + std::shared_ptr backing_cache = + NewLRUCache(capacity, num_shard_bits, strict_capacity_limit); + + FileOptions file_options; + constexpr HistogramImpl* blob_file_read_hist = nullptr; + + 
BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options, + &file_options, column_family_id, + blob_file_read_hist, nullptr /*IOTracer*/); + + // Insert into cache should fail since it has zero capacity and + // strict_capacity_limit is set + CacheHandleGuard reader; + + ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader) + .IsIncomplete()); + ASSERT_EQ(reader.GetValue(), nullptr); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); + ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/blob/blob_file_completion_callback.h b/db/blob/blob_file_completion_callback.h new file mode 100644 index 00000000000..42b6def893c --- /dev/null +++ b/db/blob/blob_file_completion_callback.h @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "db/error_handler.h" +#include "file/sst_file_manager_impl.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class BlobFileCompletionCallback { + public: +#ifdef ROCKSDB_LITE + BlobFileCompletionCallback(SstFileManager* /*sst_file_manager*/, + InstrumentedMutex* /*mutex*/, + ErrorHandler* /*error_handler*/) {} + Status OnBlobFileCompleted(const std::string& /*file_name*/) { + return Status::OK(); + } +#else + BlobFileCompletionCallback(SstFileManager* sst_file_manager, + InstrumentedMutex* mutex, + ErrorHandler* error_handler) + : sst_file_manager_(sst_file_manager), + mutex_(mutex), + error_handler_(error_handler) {} + + Status OnBlobFileCompleted(const std::string& file_name) { + Status s; + auto sfm = static_cast(sst_file_manager_); + if (sfm) { + // Report new blob files to SstFileManagerImpl + s = sfm->OnAddFile(file_name); + if (sfm->IsMaxAllowedSpaceReached()) { + s = Status::SpaceLimit("Max allowed space was reached"); + TEST_SYNC_POINT( + "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached"); + InstrumentedMutexLock l(mutex_); + error_handler_->SetBGError(s, BackgroundErrorReason::kFlush); + } + } + return s; + } + + private: + SstFileManager* sst_file_manager_; + InstrumentedMutex* mutex_; + ErrorHandler* error_handler_; +#endif // ROCKSDB_LITE +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/blob_file_reader.cc b/db/blob/blob_file_reader.cc index 0cae4eb5341..1b4b82150da 100644 --- a/db/blob/blob_file_reader.cc +++ b/db/blob/blob_file_reader.cc @@ -21,9 +21,9 @@ namespace ROCKSDB_NAMESPACE { Status BlobFileReader::Create( - const ImmutableCFOptions& immutable_cf_options, - const FileOptions& file_options, uint32_t column_family_id, - HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const ImmutableOptions& immutable_options, const FileOptions& file_options, + uint32_t column_family_id, HistogramImpl* blob_file_read_hist, + uint64_t blob_file_number, const std::shared_ptr& io_tracer, std::unique_ptr* blob_file_reader) { assert(blob_file_reader); assert(!*blob_file_reader); @@ -33,8 +33,8 @@ Status BlobFileReader::Create( 
{ const Status s = - OpenFile(immutable_cf_options, file_options, blob_file_read_hist, - blob_file_number, &file_size, &file_reader); + OpenFile(immutable_options, file_options, blob_file_read_hist, + blob_file_number, io_tracer, &file_size, &file_reader); if (!s.ok()) { return s; } @@ -66,20 +66,20 @@ Status BlobFileReader::Create( } Status BlobFileReader::OpenFile( - const ImmutableCFOptions& immutable_cf_options, - const FileOptions& file_opts, HistogramImpl* blob_file_read_hist, - uint64_t blob_file_number, uint64_t* file_size, + const ImmutableOptions& immutable_options, const FileOptions& file_opts, + HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr& io_tracer, uint64_t* file_size, std::unique_ptr* file_reader) { assert(file_size); assert(file_reader); - const auto& cf_paths = immutable_cf_options.cf_paths; + const auto& cf_paths = immutable_options.cf_paths; assert(!cf_paths.empty()); const std::string blob_file_path = BlobFileName(cf_paths.front().path, blob_file_number); - FileSystem* const fs = immutable_cf_options.fs; + FileSystem* const fs = immutable_options.fs.get(); assert(fs); constexpr IODebugContext* dbg = nullptr; @@ -112,15 +112,15 @@ Status BlobFileReader::OpenFile( assert(file); - if (immutable_cf_options.advise_random_on_open) { + if (immutable_options.advise_random_on_open) { file->Hint(FSRandomAccessFile::kRandom); } file_reader->reset(new RandomAccessFileReader( - std::move(file), blob_file_path, immutable_cf_options.env, - std::shared_ptr(), immutable_cf_options.statistics, - BLOB_DB_BLOB_FILE_READ_MICROS, blob_file_read_hist, - immutable_cf_options.rate_limiter, immutable_cf_options.listeners)); + std::move(file), blob_file_path, immutable_options.clock, io_tracer, + immutable_options.stats, BLOB_DB_BLOB_FILE_READ_MICROS, + blob_file_read_hist, immutable_options.rate_limiter.get(), + immutable_options.listeners)); return Status::OK(); } @@ -269,7 +269,8 @@ Status BlobFileReader::GetBlob(const ReadOptions& read_options, const Slice& user_key, uint64_t offset, uint64_t value_size, CompressionType compression_type, - PinnableSlice* value) const { + PinnableSlice* value, + uint64_t* bytes_read) const { assert(value); const uint64_t key_size = user_key.size(); @@ -292,6 +293,9 @@ Status BlobFileReader::GetBlob(const ReadOptions& read_options, : 0; assert(offset >= adjustment); + const uint64_t record_offset = offset - adjustment; + const uint64_t record_size = value_size + adjustment; + Slice record_slice; Buffer buf; AlignedBuf aligned_buf; @@ -299,9 +303,6 @@ Status BlobFileReader::GetBlob(const ReadOptions& read_options, { TEST_SYNC_POINT("BlobFileReader::GetBlob:ReadFromFile"); - const uint64_t record_offset = offset - adjustment; - const uint64_t record_size = value_size + adjustment; - const Status s = ReadFromFile(file_reader_.get(), record_offset, static_cast(record_size), &record_slice, &buf, &aligned_buf); @@ -330,6 +331,10 @@ Status BlobFileReader::GetBlob(const ReadOptions& read_options, } } + if (bytes_read) { + *bytes_read = record_size; + } + return Status::OK(); } diff --git a/db/blob/blob_file_reader.h b/db/blob/blob_file_reader.h index 8c7df393df9..9b3f5ebd624 100644 --- a/db/blob/blob_file_reader.h +++ b/db/blob/blob_file_reader.h @@ -15,7 +15,7 @@ namespace ROCKSDB_NAMESPACE { class Status; -struct ImmutableCFOptions; +struct ImmutableOptions; struct FileOptions; class HistogramImpl; struct ReadOptions; @@ -24,11 +24,12 @@ class PinnableSlice; class BlobFileReader { public: - static Status Create(const 
ImmutableCFOptions& immutable_cf_options, + static Status Create(const ImmutableOptions& immutable_options, const FileOptions& file_options, uint32_t column_family_id, HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr& io_tracer, std::unique_ptr* reader); BlobFileReader(const BlobFileReader&) = delete; @@ -38,16 +39,19 @@ class BlobFileReader { Status GetBlob(const ReadOptions& read_options, const Slice& user_key, uint64_t offset, uint64_t value_size, - CompressionType compression_type, PinnableSlice* value) const; + CompressionType compression_type, PinnableSlice* value, + uint64_t* bytes_read) const; private: BlobFileReader(std::unique_ptr&& file_reader, uint64_t file_size, CompressionType compression_type); - static Status OpenFile(const ImmutableCFOptions& immutable_cf_options, + static Status OpenFile(const ImmutableOptions& immutable_options, const FileOptions& file_opts, HistogramImpl* blob_file_read_hist, - uint64_t blob_file_number, uint64_t* file_size, + uint64_t blob_file_number, + const std::shared_ptr& io_tracer, + uint64_t* file_size, std::unique_ptr* file_reader); static Status ReadHeader(const RandomAccessFileReader* file_reader, diff --git a/db/blob/blob_file_reader_test.cc b/db/blob/blob_file_reader_test.cc index e8af662f1eb..e08a4bab836 100644 --- a/db/blob/blob_file_reader_test.cc +++ b/db/blob/blob_file_reader_test.cc @@ -18,6 +18,7 @@ #include "rocksdb/env.h" #include "rocksdb/file_system.h" #include "rocksdb/options.h" +#include "test_util/sync_point.h" #include "test_util/testharness.h" #include "util/compression.h" #include "utilities/fault_injection_env.h" @@ -29,34 +30,34 @@ namespace { // Creates a test blob file with a single blob in it. Note: this method // makes it possible to test various corner cases by allowing the caller // to specify the contents of various blob file header/footer fields. 
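A recurring change in the reader tests below is the new bytes_read out-parameter of BlobFileReader::GetBlob. Without checksum verification only the value itself is read, so bytes_read equals the blob size; with read_options.verify_checksums set, the whole record (header, key and value) is read, and the tests expect the record-header adjustment to be included. Roughly, the expectation exercised below:

uint64_t bytes_read = 0;
ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size,
                          kNoCompression, &value, &bytes_read));
// verify_checksums == false: bytes_read == blob_size
// verify_checksums == true:
//   bytes_read == BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size)
//                 + blob_size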
-void WriteBlobFile(const ImmutableCFOptions& immutable_cf_options, +void WriteBlobFile(const ImmutableOptions& immutable_options, uint32_t column_family_id, bool has_ttl, const ExpirationRange& expiration_range_header, const ExpirationRange& expiration_range_footer, uint64_t blob_file_number, const Slice& key, const Slice& blob, CompressionType compression_type, uint64_t* blob_offset, uint64_t* blob_size) { - assert(!immutable_cf_options.cf_paths.empty()); + assert(!immutable_options.cf_paths.empty()); assert(blob_offset); assert(blob_size); - const std::string blob_file_path = BlobFileName( - immutable_cf_options.cf_paths.front().path, blob_file_number); + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); std::unique_ptr file; - ASSERT_OK(NewWritableFile(immutable_cf_options.fs, blob_file_path, &file, + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, FileOptions())); - std::unique_ptr file_writer( - new WritableFileWriter(std::move(file), blob_file_path, FileOptions(), - immutable_cf_options.env)); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), blob_file_path, FileOptions(), immutable_options.clock)); constexpr Statistics* statistics = nullptr; constexpr bool use_fsync = false; + constexpr bool do_flush = false; - BlobLogWriter blob_log_writer(std::move(file_writer), - immutable_cf_options.env, statistics, - blob_file_number, use_fsync); + BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock, + statistics, blob_file_number, use_fsync, + do_flush); BlobLogHeader header(column_family_id, compression_type, has_ttl, expiration_range_header); @@ -120,7 +121,7 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -132,17 +133,17 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, - kNoCompression, &blob_offset, &blob_size); + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - ASSERT_OK(BlobFileReader::Create(immutable_cf_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, &reader)); + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); // Make sure the blob can be retrieved with and without checksum verification ReadOptions read_options; @@ -150,83 +151,103 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, - kNoCompression, &value)); + kNoCompression, &value, &bytes_read)); ASSERT_EQ(value, blob); + ASSERT_EQ(bytes_read, blob_size); } read_options.verify_checksums = true; { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, - kNoCompression, &value)); + kNoCompression, &value, &bytes_read)); ASSERT_EQ(value, blob); + + constexpr 
uint64_t key_size = sizeof(key) - 1; + ASSERT_EQ(bytes_read, + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_size); } // Invalid offset (too close to start of file) { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(read_options, key, blob_offset - 1, blob_size, - kNoCompression, &value) + kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } // Invalid offset (too close to end of file) { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(read_options, key, blob_offset + 1, blob_size, - kNoCompression, &value) + kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } // Incorrect compression type { PinnableSlice value; + uint64_t bytes_read = 0; - ASSERT_TRUE( - reader - ->GetBlob(read_options, key, blob_offset, blob_size, kZSTD, &value) - .IsCorruption()); + ASSERT_TRUE(reader + ->GetBlob(read_options, key, blob_offset, blob_size, kZSTD, + &value, &bytes_read) + .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } // Incorrect key size { constexpr char shorter_key[] = "k"; PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(read_options, shorter_key, blob_offset - (sizeof(key) - sizeof(shorter_key)), - blob_size, kNoCompression, &value) + blob_size, kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } // Incorrect key { constexpr char incorrect_key[] = "foo"; PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(read_options, incorrect_key, blob_offset, - blob_size, kNoCompression, &value) + blob_size, kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } // Incorrect value size { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(read_options, key, blob_offset, blob_size + 1, - kNoCompression, &value) + kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } } @@ -240,7 +261,7 @@ TEST_F(BlobFileReaderTest, Malformed) { test::PerThreadDBPath(&mock_env_, "BlobFileReaderTest_Malformed"), 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr uint64_t blob_file_number = 1; @@ -249,23 +270,24 @@ TEST_F(BlobFileReaderTest, Malformed) { constexpr bool has_ttl = false; constexpr ExpirationRange expiration_range; - const std::string blob_file_path = BlobFileName( - immutable_cf_options.cf_paths.front().path, blob_file_number); + const std::string blob_file_path = + BlobFileName(immutable_options.cf_paths.front().path, blob_file_number); std::unique_ptr file; - ASSERT_OK(NewWritableFile(immutable_cf_options.fs, blob_file_path, &file, + ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file, FileOptions())); std::unique_ptr file_writer( new WritableFileWriter(std::move(file), blob_file_path, FileOptions(), - immutable_cf_options.env)); + immutable_options.clock)); constexpr Statistics* statistics = nullptr; constexpr bool use_fsync = false; + constexpr bool do_flush = false; BlobLogWriter blob_log_writer(std::move(file_writer), - immutable_cf_options.env, statistics, - blob_file_number, use_fsync); + immutable_options.clock, statistics, + blob_file_number, use_fsync, do_flush); BlobLogHeader header(column_family_id, kNoCompression, has_ttl, expiration_range); @@ -277,9 +299,10 @@ TEST_F(BlobFileReaderTest, Malformed) { std::unique_ptr reader; - 
ASSERT_TRUE(BlobFileReader::Create(immutable_cf_options, FileOptions(), + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, &reader) + blob_file_number, nullptr /*IOTracer*/, + &reader) .IsCorruption()); } @@ -290,7 +313,7 @@ TEST_F(BlobFileReaderTest, TTL) { test::PerThreadDBPath(&mock_env_, "BlobFileReaderTest_TTL"), 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = true; @@ -302,17 +325,18 @@ TEST_F(BlobFileReaderTest, TTL) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, - kNoCompression, &blob_offset, &blob_size); + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - ASSERT_TRUE(BlobFileReader::Create(immutable_cf_options, FileOptions(), + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, &reader) + blob_file_number, nullptr /*IOTracer*/, + &reader) .IsCorruption()); } @@ -325,7 +349,7 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -339,7 +363,7 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range_header, expiration_range_footer, blob_file_number, key, blob, kNoCompression, &blob_offset, &blob_size); @@ -348,9 +372,10 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { std::unique_ptr reader; - ASSERT_TRUE(BlobFileReader::Create(immutable_cf_options, FileOptions(), + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, &reader) + blob_file_number, nullptr /*IOTracer*/, + &reader) .IsCorruption()); } @@ -363,7 +388,7 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -377,7 +402,7 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range_header, expiration_range_footer, blob_file_number, key, blob, kNoCompression, &blob_offset, &blob_size); @@ -386,9 +411,10 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { std::unique_ptr reader; - ASSERT_TRUE(BlobFileReader::Create(immutable_cf_options, FileOptions(), + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, &reader) + blob_file_number, nullptr /*IOTracer*/, + &reader) .IsCorruption()); } @@ -401,7 +427,7 @@ 
TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -413,9 +439,9 @@ TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, - kNoCompression, &blob_offset, &blob_size); + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); constexpr HistogramImpl* blob_file_read_hist = nullptr; @@ -423,10 +449,10 @@ TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { constexpr uint32_t incorrect_column_family_id = 2; - ASSERT_TRUE(BlobFileReader::Create(immutable_cf_options, FileOptions(), + ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), incorrect_column_family_id, blob_file_read_hist, blob_file_number, - &reader) + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -437,7 +463,7 @@ TEST_F(BlobFileReaderTest, BlobCRCError) { test::PerThreadDBPath(&mock_env_, "BlobFileReaderTest_BlobCRCError"), 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -449,17 +475,17 @@ TEST_F(BlobFileReaderTest, BlobCRCError) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, - kNoCompression, &blob_offset, &blob_size); + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - ASSERT_OK(BlobFileReader::Create(immutable_cf_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, &reader)); + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); SyncPoint::GetInstance()->SetCallBack( "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) { @@ -472,11 +498,13 @@ TEST_F(BlobFileReaderTest, BlobCRCError) { SyncPoint::GetInstance()->EnableProcessing(); PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(ReadOptions(), key, blob_offset, blob_size, - kNoCompression, &value) + kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -493,7 +521,7 @@ TEST_F(BlobFileReaderTest, Compression) { test::PerThreadDBPath(&mock_env_, "BlobFileReaderTest_Compression"), 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -505,17 +533,17 @@ TEST_F(BlobFileReaderTest, Compression) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, + WriteBlobFile(immutable_options, 
column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kSnappyCompression, &blob_offset, &blob_size); constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - ASSERT_OK(BlobFileReader::Create(immutable_cf_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, &reader)); + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); // Make sure the blob can be retrieved with and without checksum verification ReadOptions read_options; @@ -523,20 +551,28 @@ TEST_F(BlobFileReaderTest, Compression) { { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, - kSnappyCompression, &value)); + kSnappyCompression, &value, &bytes_read)); ASSERT_EQ(value, blob); + ASSERT_EQ(bytes_read, blob_size); } read_options.verify_checksums = true; { PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size, - kSnappyCompression, &value)); + kSnappyCompression, &value, &bytes_read)); ASSERT_EQ(value, blob); + + constexpr uint64_t key_size = sizeof(key) - 1; + ASSERT_EQ(bytes_read, + BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + + blob_size); } } @@ -553,7 +589,7 @@ TEST_F(BlobFileReaderTest, UncompressionError) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -565,17 +601,17 @@ TEST_F(BlobFileReaderTest, UncompressionError) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kSnappyCompression, &blob_offset, &blob_size); constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - ASSERT_OK(BlobFileReader::Create(immutable_cf_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, &reader)); + ASSERT_OK(BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader)); SyncPoint::GetInstance()->SetCallBack( "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) { @@ -589,11 +625,13 @@ TEST_F(BlobFileReaderTest, UncompressionError) { SyncPoint::GetInstance()->EnableProcessing(); PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(ReadOptions(), key, blob_offset, blob_size, - kSnappyCompression, &value) + kSnappyCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -632,7 +670,7 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -644,9 +682,9 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, - 
kNoCompression, &blob_offset, &blob_size); + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { fault_injection_env_.SetFilesystemActive(false, @@ -658,9 +696,9 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) { std::unique_ptr reader; - const Status s = BlobFileReader::Create(immutable_cf_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, &reader); + const Status s = BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader); const bool fail_during_create = (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile"); @@ -671,11 +709,13 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) { ASSERT_OK(s); PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(ReadOptions(), key, blob_offset, blob_size, - kNoCompression, &value) + kNoCompression, &value, &bytes_read) .IsIOError()); + ASSERT_EQ(bytes_read, 0); } SyncPoint::GetInstance()->DisableProcessing(); @@ -708,7 +748,7 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { 0); options.enable_blob_files = true; - ImmutableCFOptions immutable_cf_options(options); + ImmutableOptions immutable_options(options); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -720,9 +760,9 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { uint64_t blob_offset = 0; uint64_t blob_size = 0; - WriteBlobFile(immutable_cf_options, column_family_id, has_ttl, - expiration_range, expiration_range, blob_file_number, key, blob, - kNoCompression, &blob_offset, &blob_size); + WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range, + expiration_range, blob_file_number, key, blob, kNoCompression, + &blob_offset, &blob_size); SyncPoint::GetInstance()->SetCallBack(sync_point_, [](void* arg) { Slice* const slice = static_cast(arg); @@ -738,9 +778,9 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { std::unique_ptr reader; - const Status s = BlobFileReader::Create(immutable_cf_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, &reader); + const Status s = BlobFileReader::Create( + immutable_options, FileOptions(), column_family_id, blob_file_read_hist, + blob_file_number, nullptr /*IOTracer*/, &reader); const bool fail_during_create = sync_point_ != "BlobFileReader::GetBlob:TamperWithResult"; @@ -751,11 +791,13 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { ASSERT_OK(s); PinnableSlice value; + uint64_t bytes_read = 0; ASSERT_TRUE(reader ->GetBlob(ReadOptions(), key, blob_offset, blob_size, - kNoCompression, &value) + kNoCompression, &value, &bytes_read) .IsCorruption()); + ASSERT_EQ(bytes_read, 0); } SyncPoint::GetInstance()->DisableProcessing(); diff --git a/db/blob/blob_log_format.cc b/db/blob/blob_log_format.cc index b5cd0bdcc77..482bd078e6b 100644 --- a/db/blob/blob_log_format.cc +++ b/db/blob/blob_log_format.cc @@ -95,10 +95,6 @@ Status BlobLogFooter::DecodeFrom(Slice src) { return Status::OK(); } -uint64_t BlobLogRecord::CalculateAdjustmentForRecordHeader(uint64_t key_size) { - return key_size + kHeaderSize; -} - void BlobLogRecord::EncodeHeaderTo(std::string* dst) { assert(dst != nullptr); dst->clear(); diff --git a/db/blob/blob_log_format.h b/db/blob/blob_log_format.h index afeb8d37090..539bbb52613 100644 --- 
a/db/blob/blob_log_format.h +++ b/db/blob/blob_log_format.h @@ -107,7 +107,9 @@ struct BlobLogRecord { // Note that the offset field of BlobIndex actually points to the blob value // as opposed to the start of the blob record. The following method can // be used to calculate the adjustment needed to read the blob record header. - static uint64_t CalculateAdjustmentForRecordHeader(uint64_t key_size); + static uint64_t CalculateAdjustmentForRecordHeader(uint64_t key_size) { + return key_size + kHeaderSize; + } uint64_t key_size = 0; uint64_t value_size = 0; diff --git a/db/blob/blob_log_sequential_reader.cc b/db/blob/blob_log_sequential_reader.cc index 58afd27a9c6..448b3b6f7d6 100644 --- a/db/blob/blob_log_sequential_reader.cc +++ b/db/blob/blob_log_sequential_reader.cc @@ -6,8 +6,6 @@ #include "db/blob/blob_log_sequential_reader.h" -#include - #include "file/random_access_file_reader.h" #include "monitoring/statistics.h" #include "util/stop_watch.h" @@ -15,10 +13,10 @@ namespace ROCKSDB_NAMESPACE { BlobLogSequentialReader::BlobLogSequentialReader( - std::unique_ptr&& file_reader, Env* env, + std::unique_ptr&& file_reader, SystemClock* clock, Statistics* statistics) : file_(std::move(file_reader)), - env_(env), + clock_(clock), statistics_(statistics), next_byte_(0) {} @@ -29,7 +27,7 @@ Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice, assert(slice); assert(file_); - StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); + StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); Status s = file_->Read(IOOptions(), next_byte_, static_cast(size), slice, buf, nullptr); next_byte_ += size; diff --git a/db/blob/blob_log_sequential_reader.h b/db/blob/blob_log_sequential_reader.h index 62c707b4dd8..f8e1c02bdbc 100644 --- a/db/blob/blob_log_sequential_reader.h +++ b/db/blob/blob_log_sequential_reader.h @@ -16,6 +16,7 @@ class RandomAccessFileReader; class Env; class Statistics; class Status; +class SystemClock; /** * BlobLogSequentialReader is a general purpose log stream reader @@ -35,7 +36,7 @@ class BlobLogSequentialReader { // Create a reader that will return log records from "*file_reader". 
BlobLogSequentialReader(std::unique_ptr&& file_reader, - Env* env, Statistics* statistics); + SystemClock* clock, Statistics* statistics); // No copying allowed BlobLogSequentialReader(const BlobLogSequentialReader&) = delete; @@ -63,7 +64,8 @@ class BlobLogSequentialReader { Status ReadSlice(uint64_t size, Slice* slice, char* buf); const std::unique_ptr file_; - Env* env_; + SystemClock* clock_; + Statistics* statistics_; Slice buffer_; diff --git a/db/blob/blob_log_writer.cc b/db/blob/blob_log_writer.cc index 8b3d0e2c736..2dabc98e802 100644 --- a/db/blob/blob_log_writer.cc +++ b/db/blob/blob_log_writer.cc @@ -11,7 +11,7 @@ #include "db/blob/blob_log_format.h" #include "file/writable_file_writer.h" #include "monitoring/statistics.h" -#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/stop_watch.h" @@ -19,14 +19,16 @@ namespace ROCKSDB_NAMESPACE { BlobLogWriter::BlobLogWriter(std::unique_ptr&& dest, - Env* env, Statistics* statistics, - uint64_t log_number, bool use_fs, uint64_t boffset) + SystemClock* clock, Statistics* statistics, + uint64_t log_number, bool use_fs, bool do_flush, + uint64_t boffset) : dest_(std::move(dest)), - env_(env), + clock_(clock), statistics_(statistics), log_number_(log_number), block_offset_(boffset), use_fsync_(use_fs), + do_flush_(do_flush), last_elem_type_(kEtNone) {} BlobLogWriter::~BlobLogWriter() = default; @@ -34,7 +36,7 @@ BlobLogWriter::~BlobLogWriter() = default; Status BlobLogWriter::Sync() { TEST_SYNC_POINT("BlobLogWriter::Sync"); - StopWatch sync_sw(env_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS); + StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS); Status s = dest_->Sync(use_fsync_); RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED); return s; @@ -49,7 +51,9 @@ Status BlobLogWriter::WriteHeader(BlobLogHeader& header) { Status s = dest_->Append(Slice(str)); if (s.ok()) { block_offset_ += str.size(); - s = dest_->Flush(); + if (do_flush_) { + s = dest_->Flush(); + } } last_elem_type_ = kEtFileHdr; RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN, @@ -144,7 +148,7 @@ Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf, const Slice& key, const Slice& val, uint64_t* key_offset, uint64_t* blob_offset) { - StopWatch write_sw(env_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS); + StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS); Status s = dest_->Append(Slice(headerbuf)); if (s.ok()) { s = dest_->Append(key); @@ -152,7 +156,7 @@ Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf, if (s.ok()) { s = dest_->Append(val); } - if (s.ok()) { + if (do_flush_ && s.ok()) { s = dest_->Flush(); } diff --git a/db/blob/blob_log_writer.h b/db/blob/blob_log_writer.h index 0f9ea251642..c1f9f31ad00 100644 --- a/db/blob/blob_log_writer.h +++ b/db/blob/blob_log_writer.h @@ -9,7 +9,6 @@ #include #include "db/blob/blob_log_format.h" -#include "rocksdb/env.h" #include "rocksdb/slice.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" @@ -18,7 +17,7 @@ namespace ROCKSDB_NAMESPACE { class WritableFileWriter; - +class SystemClock; /** * BlobLogWriter is the blob log stream writer. It provides an append-only * abstraction for writing blob data. @@ -32,9 +31,9 @@ class BlobLogWriter { // Create a writer that will append data to "*dest". // "*dest" must be initially empty. // "*dest" must remain live while this BlobLogWriter is in use. 
- BlobLogWriter(std::unique_ptr&& dest, Env* env, + BlobLogWriter(std::unique_ptr&& dest, SystemClock* clock, Statistics* statistics, uint64_t log_number, bool use_fsync, - uint64_t boffset = 0); + bool do_flush, uint64_t boffset = 0); // No copying allowed BlobLogWriter(const BlobLogWriter&) = delete; BlobLogWriter& operator=(const BlobLogWriter&) = delete; @@ -69,11 +68,12 @@ class BlobLogWriter { private: std::unique_ptr dest_; - Env* env_; + SystemClock* clock_; Statistics* statistics_; uint64_t log_number_; uint64_t block_offset_; // Current offset in block bool use_fsync_; + bool do_flush_; public: enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter }; diff --git a/db/blob/db_blob_basic_test.cc b/db/blob/db_blob_basic_test.cc new file mode 100644 index 00000000000..feee834c51f --- /dev/null +++ b/db/blob/db_blob_basic_test.cc @@ -0,0 +1,517 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "db/blob/blob_index.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobBasicTest : public DBTestBase { + protected: + DBBlobBasicTest() + : DBTestBase("/db_blob_basic_test", /* env_do_fsync */ false) {} +}; + +TEST_F(DBBlobBasicTest, GetBlob) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + ASSERT_EQ(Get(key), blob_value); + + // Try again with no I/O allowed. The table and the necessary blocks should + // already be in their respective caches; however, the blob itself can only be + // read from the blob file, so the read should return Incomplete. + ReadOptions read_options; + read_options.read_tier = kBlockCacheTier; + + PinnableSlice result; + ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result) + .IsIncomplete()); +} + +TEST_F(DBBlobBasicTest, MultiGetBlobs) { + constexpr size_t min_blob_size = 6; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + // Put then retrieve three key-values. The first value is below the size limit + // and is thus stored inline; the other two are stored separately as blobs. 
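+// (With min_blob_size = 6, "short" is 5 bytes and stays inline, while
+// "long_value" and "other_long_value" are 10 and 16 bytes respectively and
+// end up in the blob file; the static_asserts below pin down these size
+// assumptions.)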
+ constexpr size_t num_keys = 3; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "short"; + static_assert(sizeof(first_value) - 1 < min_blob_size, + "first_value too long to be inlined"); + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "long_value"; + static_assert(sizeof(second_value) - 1 >= min_blob_size, + "second_value too short to be stored as blob"); + + ASSERT_OK(Put(second_key, second_value)); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "other_long_value"; + static_assert(sizeof(third_value) - 1 >= min_blob_size, + "third_value too short to be stored as blob"); + + ASSERT_OK(Put(third_key, third_value)); + + ASSERT_OK(Flush()); + + ReadOptions read_options; + + std::array keys{{first_key, second_key, third_key}}; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } + + // Try again with no I/O allowed. The table and the necessary blocks should + // already be in their respective caches. The first (inlined) value should be + // successfully read; however, the two blob values could only be read from the + // blob file, so for those the read should return Incomplete. + read_options.read_tier = kBlockCacheTier; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_TRUE(statuses[1].IsIncomplete()); + + ASSERT_TRUE(statuses[2].IsIncomplete()); + } +} + +TEST_F(DBBlobBasicTest, GetBlob_CorruptIndex) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + + // Fake a corrupt blob index. + const std::string blob_index("foobar"); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +TEST_F(DBBlobBasicTest, GetBlob_InlinedTTLIndex) { + constexpr uint64_t min_blob_size = 10; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob[] = "short"; + static_assert(sizeof(short) - 1 < min_blob_size, + "Blob too long to be inlined"); + + // Fake an inlined TTL blob index. 
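+// Inlined TTL blob indexes are only written by the legacy BlobDB; the
+// integrated blob file path never produces them, so the Get below is expected
+// to surface such an index as corruption rather than return the inlined value.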
+ std::string blob_index; + + constexpr uint64_t expiration = 1234567890; + + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + + // Fake a blob index referencing a non-existent blob file. + std::string blob_index; + + constexpr uint64_t blob_file_number = 1000; + constexpr uint64_t offset = 1234; + constexpr uint64_t size = 5678; + + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsCorruption()); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobBasicTest, GenerateIOTracing) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + std::string trace_file = dbname_ + "/io_trace_file"; + + Reopen(options); + { + // Create IO trace file + std::unique_ptr trace_writer; + ASSERT_OK( + NewFileTraceWriter(env_, EnvOptions(), trace_file, &trace_writer)); + ASSERT_OK(db_->StartIOTrace(TraceOptions(), std::move(trace_writer))); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Flush()); + ASSERT_EQ(Get(key), blob_value); + + ASSERT_OK(db_->EndIOTrace()); + ASSERT_OK(env_->FileExists(trace_file)); + } + { + // Parse trace file to check file operations related to blob files are + // recorded. + std::unique_ptr trace_reader; + ASSERT_OK( + NewFileTraceReader(env_, EnvOptions(), trace_file, &trace_reader)); + IOTraceReader reader(std::move(trace_reader)); + + IOTraceHeader header; + ASSERT_OK(reader.ReadHeader(&header)); + ASSERT_EQ(kMajorVersion, static_cast(header.rocksdb_major_version)); + ASSERT_EQ(kMinorVersion, static_cast(header.rocksdb_minor_version)); + + // Read records. + int blob_files_op_count = 0; + Status status; + while (true) { + IOTraceRecord record; + status = reader.ReadIOOp(&record); + if (!status.ok()) { + break; + } + if (record.file_name.find("blob") != std::string::npos) { + blob_files_op_count++; + } + } + // Assuming blob files will have Append, Close and then Read operations. 
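+// (That is, at least three I/O records should mention the blob file, which is
+// what the strict lower bound of 2 below checks.)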
+ ASSERT_GT(blob_files_op_count, 2); + } +} +#endif // !ROCKSDB_LITE + +TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + Reopen(options); + + ASSERT_OK(dbfull()->DisableFileDeletions()); + constexpr int kNumTableFiles = 2; + for (int i = 0; i < kNumTableFiles; ++i) { + for (char ch = 'a'; ch != 'c'; ++ch) { + std::string key(1, ch); + ASSERT_OK(Put(key, "value" + std::to_string(i))); + } + ASSERT_OK(Flush()); + } + + Close(); + + std::vector files; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + std::string blob_file_path; + uint64_t max_blob_file_num = kInvalidBlobFileNumber; + for (const auto& fname : files) { + uint64_t file_num = 0; + FileType type; + if (ParseFileName(fname, &file_num, /*info_log_name_prefix=*/"", &type) && + type == kBlobFile) { + if (file_num > max_blob_file_num) { + max_blob_file_num = file_num; + blob_file_path = dbname_ + "/" + fname; + } + } + } + ASSERT_OK(env_->DeleteFile(blob_file_path)); + + options.best_efforts_recovery = true; + Reopen(options); + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "a", &value)); + ASSERT_EQ("value" + std::to_string(kNumTableFiles - 2), value); +} + +TEST_F(DBBlobBasicTest, GetMergeBlobWithPut) { + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + ASSERT_OK(Put("Key1", "v1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key1", "v2")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key1", "v3")); + ASSERT_OK(Flush()); + + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), "Key1", &value)); + ASSERT_EQ(Get("Key1"), "v1,v2,v3"); +} + +TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) { + constexpr size_t num_keys = 3; + + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + ASSERT_OK(Put("Key0", "v0_0")); + ASSERT_OK(Put("Key1", "v1_0")); + ASSERT_OK(Put("Key2", "v2_0")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key0", "v0_1")); + ASSERT_OK(Merge("Key1", "v1_1")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("Key0", "v0_2")); + ASSERT_OK(Flush()); + + std::array keys{{"Key0", "Key1", "Key2"}}; + std::array values; + std::array statuses; + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], "v0_0,v0_1,v0_2"); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], "v1_0,v1_1"); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], "v2_0"); +} + +class DBBlobBasicIOErrorTest : public DBBlobBasicTest, + public testing::WithParamInterface { + protected: + DBBlobBasicIOErrorTest() : sync_point_(GetParam()) { + fault_injection_env_.reset(new FaultInjectionTestEnv(env_)); + } + ~DBBlobBasicIOErrorTest() { Close(); } + + std::unique_ptr fault_injection_env_; + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorTest, + ::testing::ValuesIn(std::vector{ + "BlobFileReader::OpenFile:NewRandomAccessFile", + "BlobFileReader::GetBlob:ReadFromFile"})); + +TEST_P(DBBlobBasicIOErrorTest, GetBlob_IOError) { + Options options; + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + 
constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + PinnableSlice result; + ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result) + .IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBBlobBasicIOErrorTest, MultiGetBlobs_IOError) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr size_t num_keys = 2; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + std::array<Slice, num_keys> keys{{first_key, second_key}}; + std::array<PinnableSlice, num_keys> values; + std::array<Status, num_keys> statuses; + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(statuses[0].IsIOError()); + ASSERT_TRUE(statuses[1].IsIOError()); +} + +namespace { + +class ReadBlobCompactionFilter : public CompactionFilter { + public: + ReadBlobCompactionFilter() = default; + const char* Name() const override { + return "rocksdb.compaction.filter.read.blob"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const override { + if (value_type != CompactionFilter::ValueType::kValue) { + return CompactionFilter::Decision::kKeep; + } + assert(new_value); + new_value->assign(existing_value.data(), existing_value.size()); + return CompactionFilter::Decision::kChangeValue; + } +}; + +} // anonymous namespace + +TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) { + Options options = GetDefaultOptions(); + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ReadBlobCompactionFilter); + options.compaction_filter = compaction_filter_guard.get(); + + DestroyAndReopen(options); + constexpr char key[] = "foo"; + constexpr char blob_value[] = "foo_blob_value"; + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { +
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/blob/db_blob_compaction_test.cc b/db/blob/db_blob_compaction_test.cc new file mode 100644 index 00000000000..29f10f2e228 --- /dev/null +++ b/db/blob/db_blob_compaction_test.cc @@ -0,0 +1,399 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/blob/blob_index.h" +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" +#include "utilities/fault_injection_env.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobCompactionTest : public DBTestBase { + public: + explicit DBBlobCompactionTest() + : DBTestBase("/db_blob_compaction_test", /*env_do_fsync=*/false) {} + +#ifndef ROCKSDB_LITE + const std::vector& GetCompactionStats() { + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + return internal_stats->TEST_GetCompactionStats(); + } +#endif // ROCKSDB_LITE +}; + +namespace { + +class FilterByKeyLength : public CompactionFilter { + public: + explicit FilterByKeyLength(size_t len) : length_threshold_(len) {} + const char* Name() const override { + return "rocksdb.compaction.filter.by.key.length"; + } + CompactionFilter::Decision FilterBlobByKey( + int /*level*/, const Slice& key, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (key.size() < length_threshold_) { + return CompactionFilter::Decision::kRemove; + } + return CompactionFilter::Decision::kKeep; + } + + private: + size_t length_threshold_; +}; + +class BadBlobCompactionFilter : public CompactionFilter { + public: + explicit BadBlobCompactionFilter(std::string prefix, + CompactionFilter::Decision filter_by_key, + CompactionFilter::Decision filter_v2) + : prefix_(std::move(prefix)), + filter_blob_by_key_(filter_by_key), + filter_v2_(filter_v2) {} + const char* Name() const override { return "rocksdb.compaction.filter.bad"; } + CompactionFilter::Decision FilterBlobByKey( + int /*level*/, const Slice& key, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + if (key.size() >= prefix_.size() && + 0 == strncmp(prefix_.data(), key.data(), prefix_.size())) { + return CompactionFilter::Decision::kUndetermined; + } + return filter_blob_by_key_; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return filter_v2_; + } + + private: + const std::string prefix_; + const CompactionFilter::Decision filter_blob_by_key_; + const CompactionFilter::Decision filter_v2_; +}; + +class ValueBlindWriteFilter : public CompactionFilter { + public: + explicit ValueBlindWriteFilter(std::string new_val) + : new_value_(std::move(new_val)) {} + const char* Name() const override { + return "rocksdb.compaction.filter.blind.write"; + } + CompactionFilter::Decision FilterBlobByKey( + int level, const Slice& key, std::string* new_value, + std::string* skip_until) const override; + + 
private: + const std::string new_value_; +}; + +CompactionFilter::Decision ValueBlindWriteFilter::FilterBlobByKey( + int /*level*/, const Slice& /*key*/, std::string* new_value, + std::string* /*skip_until*/) const { + assert(new_value); + new_value->assign(new_value_); + return CompactionFilter::Decision::kChangeValue; +} + +class ValueMutationFilter : public CompactionFilter { + public: + explicit ValueMutationFilter(std::string padding) + : padding_(std::move(padding)) {} + const char* Name() const override { + return "rocksdb.compaction.filter.value.mutation"; + } + CompactionFilter::Decision FilterV2(int level, const Slice& key, + ValueType value_type, + const Slice& existing_value, + std::string* new_value, + std::string* skip_until) const override; + + private: + const std::string padding_; +}; + +CompactionFilter::Decision ValueMutationFilter::FilterV2( + int /*level*/, const Slice& /*key*/, ValueType value_type, + const Slice& existing_value, std::string* new_value, + std::string* /*skip_until*/) const { + assert(CompactionFilter::ValueType::kBlobIndex != value_type); + if (CompactionFilter::ValueType::kValue != value_type) { + return CompactionFilter::Decision::kKeep; + } + assert(new_value); + new_value->assign(existing_value.data(), existing_value.size()); + new_value->append(padding_); + return CompactionFilter::Decision::kChangeValue; +} + +class AlwaysKeepFilter : public CompactionFilter { + public: + explicit AlwaysKeepFilter() = default; + const char* Name() const override { + return "rocksdb.compaction.filter.always.keep"; + } + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& /*key*/, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return CompactionFilter::Decision::kKeep; + } +}; +} // anonymous namespace + +class DBBlobBadCompactionFilterTest + : public DBBlobCompactionTest, + public testing::WithParamInterface< + std::tuple<std::string, CompactionFilter::Decision, + CompactionFilter::Decision>> { + public: + explicit DBBlobBadCompactionFilterTest() + : compaction_filter_guard_(new BadBlobCompactionFilter( + std::get<0>(GetParam()), std::get<1>(GetParam()), + std::get<2>(GetParam()))) {} + + protected: + std::unique_ptr<CompactionFilter> compaction_filter_guard_; +}; + +INSTANTIATE_TEST_CASE_P( + BadCompactionFilter, DBBlobBadCompactionFilterTest, + testing::Combine( + testing::Values("a"), + testing::Values(CompactionFilter::Decision::kChangeBlobIndex, + CompactionFilter::Decision::kIOError), + testing::Values(CompactionFilter::Decision::kUndetermined, + CompactionFilter::Decision::kChangeBlobIndex, + CompactionFilter::Decision::kIOError))); + +TEST_F(DBBlobCompactionTest, FilterByKeyLength) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + constexpr size_t kKeyLength = 2; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new FilterByKeyLength(kKeyLength)); + options.compaction_filter = compaction_filter_guard.get(); + + constexpr char short_key[] = "a"; + constexpr char long_key[] = "abc"; + constexpr char blob_value[] = "value"; + + DestroyAndReopen(options); + ASSERT_OK(Put(short_key, blob_value)); + ASSERT_OK(Put(long_key, blob_value)); + ASSERT_OK(Flush()); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr)); + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), short_key, &value).IsNotFound()); + value.clear(); + ASSERT_OK(db_->Get(ReadOptions(), long_key, &value)); + ASSERT_EQ("value", value); +
+#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides between kKeep and kRemove solely based on key; + // this involves neither reading nor writing blobs + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, BlindWriteFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + constexpr char new_blob_value[] = "new_blob_value"; + std::unique_ptr compaction_filter_guard( + new ValueBlindWriteFilter(new_blob_value)); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + const std::vector keys = {"a", "b", "c"}; + const std::vector values = {"a_value", "b_value", "c_value"}; + assert(keys.size() == values.size()); + for (size_t i = 0; i < keys.size(); ++i) { + ASSERT_OK(Put(keys[i], values[i])); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + for (const auto& key : keys) { + ASSERT_EQ(new_blob_value, Get(key)); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter unconditionally changes value in FilterBlobByKey; + // this involves writing but not reading blobs + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_P(DBBlobBadCompactionFilterTest, BadDecisionFromCompactionFilter) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + options.compaction_filter = compaction_filter_guard_.get(); + DestroyAndReopen(options); + ASSERT_OK(Put("b", "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsNotSupported()); + Close(); + + DestroyAndReopen(options); + std::string key(std::get<0>(GetParam())); + ASSERT_OK(Put(key, "value")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsNotSupported()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilter_InlinedTTLIndex) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr compaction_filter_guard( + new ValueMutationFilter("")); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + constexpr char key[] = "key"; + constexpr char blob[] = "blob"; + // Fake an inlined TTL blob index. 
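// Illustrative note, not part of the patch: an inlined TTL blob index, as
// produced by BlobIndex::EncodeInlinedTTL below, carries an expiration time
// plus the value stored inline rather than a reference to a blob file. Such
// indexes come from the legacy StackableDB-based BlobDB, so when the
// integrated BlobDB's compaction encounters one it is reported as
// corruption, which is what this test asserts via IsCorruption().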
+ std::string blob_index; + constexpr uint64_t expiration = 1234567890; + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsCorruption()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilter) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + constexpr char padding[] = "_delta"; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueMutationFilter(padding)); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + const std::vector<std::pair<std::string, std::string>> kvs = { + {"a", "a_value"}, {"b", "b_value"}, {"c", "c_value"}}; + for (const auto& kv : kvs) { + ASSERT_OK(Put(kv.first, kv.second)); + } + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + for (const auto& kv : kvs) { + ASSERT_EQ(kv.second + std::string(padding), Get(kv.first)); + } + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter changes the value using the previous value in FilterV2; + // this involves reading and writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +TEST_F(DBBlobCompactionTest, CorruptedBlobIndex) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new ValueMutationFilter("")); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + // Mock a corrupted blob index + constexpr char key[] = "key"; + std::string blob_idx("blob_idx"); + WriteBatch write_batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&write_batch, 0, key, blob_idx)); + ASSERT_OK(db_->Write(WriteOptions(), &write_batch)); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr) + .IsCorruption()); + Close(); +} + +TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + std::unique_ptr<CompactionFilter> compaction_filter_guard( + new AlwaysKeepFilter()); + options.compaction_filter = compaction_filter_guard.get(); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Flush()); + std::vector<uint64_t> blob_files = GetBlobFileNumbers(); + ASSERT_EQ(1, blob_files.size()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + ASSERT_EQ(blob_files, GetBlobFileNumbers()); + +#ifndef ROCKSDB_LITE + const auto& compaction_stats = GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + // Filter decides to keep the existing value in FilterV2; + // this involves reading but not writing blobs + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); +#endif // ROCKSDB_LITE + + Close(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc,
argv); + return RUN_ALL_TESTS(); +} diff --git a/db/blob/db_blob_corruption_test.cc b/db/blob/db_blob_corruption_test.cc new file mode 100644 index 00000000000..77f11b75ab3 --- /dev/null +++ b/db/blob/db_blob_corruption_test.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class DBBlobCorruptionTest : public DBTestBase { + protected: + DBBlobCorruptionTest() + : DBTestBase("/db_blob_corruption_test", /* env_do_fsync */ false) {} + + void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { + // Pick file to corrupt + std::vector filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + std::string fname; + uint64_t picked_number = kInvalidBlobFileNumber; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && type == filetype && + number > picked_number) { // Pick latest file + fname = dbname_ + "/" + filenames[i]; + picked_number = number; + } + } + ASSERT_TRUE(!fname.empty()) << filetype; + ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt)); + } +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + options.file_checksum_gen_factory = + ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); + Reopen(options); + + ASSERT_OK(Put(Slice("key_1"), Slice("blob_value_1"))); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Slice("key_2"), Slice("blob_value_2"))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + Close(); + + Corrupt(kBlobFile, 0, 2); + + ASSERT_OK(TryReopen(options)); + + int count{0}; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) { + const Status* s = static_cast(arg); + ASSERT_NE(s, nullptr); + ++count; + ASSERT_NOK(*s); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption()); + ASSERT_EQ(1, count); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // !ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/blob/db_blob_index_test.cc b/db/blob/db_blob_index_test.cc index e7ceabd3e56..34bcd9fb584 100644 --- a/db/blob/db_blob_index_test.cc +++ b/db/blob/db_blob_index_test.cc @@ -73,6 +73,9 @@ class DBBlobIndexTest : public DBTestBase { if (s.IsNotFound()) { return "NOT_FOUND"; } + if (s.IsCorruption()) { + return "CORRUPTION"; + } if (s.IsNotSupported()) { return "NOT_SUPPORTED"; } @@ -95,11 +98,12 @@ class DBBlobIndexTest : public DBTestBase { ArenaWrappedDBIter* GetBlobIterator() { return dbfull()->NewIteratorImpl( ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(), - nullptr /*read_callback*/, true /*allow_blob*/); + nullptr /*read_callback*/, true /*expose_blob_index*/); } Options GetTestOptions() { Options options; 
+ options.env = CurrentOptions().env; options.create_if_missing = true; options.num_levels = 2; options.disable_auto_compactions = true; @@ -153,8 +157,13 @@ TEST_F(DBBlobIndexTest, Write) { } } -// Get should be able to return blob index if is_blob_index is provided, -// otherwise return Status::NotSupported status. +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. Get should be able to return blob index if is_blob_index is +// provided, otherwise it should return Status::NotSupported (when reading from +// memtable) or Status::Corruption (when reading from SST). Reading from SST +// returns Corruption because we can't differentiate between the application +// accidentally opening the base DB of a stacked BlobDB and actual corruption +// when using the integrated BlobDB. TEST_F(DBBlobIndexTest, Get) { for (auto tier : kAllTiers) { DestroyAndReopen(GetTestOptions()); @@ -171,15 +180,22 @@ TEST_F(DBBlobIndexTest, Get) { ASSERT_EQ("value", GetImpl("key", &is_blob_index)); ASSERT_FALSE(is_blob_index); // Verify blob index - ASSERT_TRUE(Get("blob_key", &value).IsNotSupported()); - ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key")); + if (tier <= kImmutableMemtables) { + ASSERT_TRUE(Get("blob_key", &value).IsNotSupported()); + ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key")); + } else { + ASSERT_TRUE(Get("blob_key", &value).IsCorruption()); + ASSERT_EQ("CORRUPTION", GetImpl("blob_key")); + } ASSERT_EQ("blob_index", GetImpl("blob_key", &is_blob_index)); ASSERT_TRUE(is_blob_index); } } -// Get should NOT return Status::NotSupported if blob index is updated with -// a normal value. +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. Get should NOT return Status::NotSupported/Status::Corruption +// if blob index is updated with a normal value. See the test case above for +// more details. TEST_F(DBBlobIndexTest, Updated) { for (auto tier : kAllTiers) { DestroyAndReopen(GetTestOptions()); @@ -206,7 +222,11 @@ TEST_F(DBBlobIndexTest, Updated) { ASSERT_EQ("blob_index", GetBlobIndex("key" + ToString(i), snapshot)); } ASSERT_EQ("new_value", Get("key1")); - ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2")); + if (tier <= kImmutableMemtables) { + ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2")); + } else { + ASSERT_EQ("CORRUPTION", GetImpl("key2")); + } ASSERT_EQ("NOT_FOUND", Get("key3")); ASSERT_EQ("NOT_FOUND", Get("key4")); ASSERT_EQ("a,b,c", GetImpl("key5")); @@ -218,8 +238,11 @@ TEST_F(DBBlobIndexTest, Updated) { } } -// Iterator should get blob value if allow_blob flag is set, -// otherwise return Status::NotSupported status. +// Note: the following test case pertains to the StackableDB-based BlobDB +// implementation. When a blob iterator is used, it should set the +// expose_blob_index flag for the underlying DBIter, and retrieve/return the +// corresponding blob value. If a regular DBIter is created (i.e. +// expose_blob_index is not set), it should return Status::Corruption. 
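To make the behavior described above concrete, the following is a rough sketch (not part of the patch) of what an application sees if it opens the base DB underneath a stacked BlobDB and reads a key whose stored value is a blob index; the DB handle and key name are hypothetical:

    std::string value;
    rocksdb::Status s =
        base_db->Get(rocksdb::ReadOptions(), "blob_key", &value);
    // Blob index still in a (mutable or immutable) memtable:
    //   s.IsNotSupported() == true
    // Blob index already flushed to an SST file:
    //   s.IsCorruption() == true
    // With the integrated BlobDB (enable_blob_files), Get() instead resolves
    // the blob and returns the value with an OK status.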
TEST_F(DBBlobIndexTest, Iterate) { const std::vector> data = { /*00*/ {kTypeValue}, @@ -282,6 +305,7 @@ TEST_F(DBBlobIndexTest, Iterate) { std::function extra_check = nullptr) { // Seek auto* iterator = create_iterator(); + ASSERT_OK(iterator->status()); ASSERT_OK(iterator->Refresh()); iterator->Seek(get_key(index)); check_iterator(iterator, expected_status, forward_value); @@ -295,6 +319,7 @@ TEST_F(DBBlobIndexTest, Iterate) { ASSERT_OK(iterator->Refresh()); iterator->Seek(get_key(index - 1)); ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); iterator->Next(); check_iterator(iterator, expected_status, forward_value); if (extra_check) { @@ -304,6 +329,7 @@ TEST_F(DBBlobIndexTest, Iterate) { // SeekForPrev iterator = create_iterator(); + ASSERT_OK(iterator->status()); ASSERT_OK(iterator->Refresh()); iterator->SeekForPrev(get_key(index)); check_iterator(iterator, expected_status, backward_value); @@ -316,6 +342,7 @@ TEST_F(DBBlobIndexTest, Iterate) { iterator = create_iterator(); iterator->Seek(get_key(index + 1)); ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); iterator->Prev(); check_iterator(iterator, expected_status, backward_value); if (extra_check) { @@ -353,7 +380,7 @@ TEST_F(DBBlobIndexTest, Iterate) { ASSERT_OK(Write(&batch)); break; default: - assert(false); + FAIL(); }; } snapshots.push_back(dbfull()->GetSnapshot()); @@ -364,15 +391,15 @@ TEST_F(DBBlobIndexTest, Iterate) { MoveDataTo(tier); // Normal iterator - verify(1, Status::kNotSupported, "", "", create_normal_iterator); - verify(3, Status::kNotSupported, "", "", create_normal_iterator); + verify(1, Status::kCorruption, "", "", create_normal_iterator); + verify(3, Status::kCorruption, "", "", create_normal_iterator); verify(5, Status::kOk, get_value(5, 0), get_value(5, 0), create_normal_iterator); verify(7, Status::kOk, get_value(8, 0), get_value(6, 0), create_normal_iterator); verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), create_normal_iterator); - verify(11, Status::kNotSupported, "", "", create_normal_iterator); + verify(11, Status::kCorruption, "", "", create_normal_iterator); verify(13, Status::kOk, get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), @@ -391,7 +418,11 @@ TEST_F(DBBlobIndexTest, Iterate) { create_blob_iterator, check_is_blob(false)); verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), create_blob_iterator, check_is_blob(false)); - verify(11, Status::kNotSupported, "", "", create_blob_iterator); + if (tier <= kImmutableMemtables) { + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + } else { + verify(11, Status::kCorruption, "", "", create_blob_iterator); + } verify(13, Status::kOk, get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), @@ -413,7 +444,11 @@ TEST_F(DBBlobIndexTest, Iterate) { create_blob_iterator, check_is_blob(false)); verify(9, Status::kOk, get_value(10, 0), get_value(8, 0), create_blob_iterator, check_is_blob(false)); - verify(11, Status::kNotSupported, "", "", create_blob_iterator); + if (tier <= kImmutableMemtables) { + verify(11, Status::kNotSupported, "", "", create_blob_iterator); + } else { + verify(11, Status::kCorruption, "", "", create_blob_iterator); + } verify(13, Status::kOk, get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0), @@ -428,6 +463,106 @@ 
TEST_F(DBBlobIndexTest, Iterate) { } } +TEST_F(DBBlobIndexTest, IntegratedBlobIterate) { + const std::vector> data = { + /*00*/ {"Put"}, + /*01*/ {"Put", "Merge", "Merge", "Merge"}, + /*02*/ {"Put"}}; + + auto get_key = [](size_t index) { return ("key" + std::to_string(index)); }; + + auto get_value = [&](size_t index, size_t version) { + return get_key(index) + "_value" + ToString(version); + }; + + auto check_iterator = [&](Iterator* iterator, Status expected_status, + const Slice& expected_value) { + ASSERT_EQ(expected_status, iterator->status()); + if (expected_status.ok()) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_EQ(expected_value, iterator->value()); + } else { + ASSERT_FALSE(iterator->Valid()); + } + }; + + auto verify = [&](size_t index, Status expected_status, + const Slice& expected_value) { + // Seek + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index)); + check_iterator(iterator, expected_status, expected_value); + } + // Next + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + ASSERT_OK(iterator->Refresh()); + iterator->Seek(get_key(index - 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Next(); + check_iterator(iterator, expected_status, expected_value); + } + // SeekForPrev + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + ASSERT_OK(iterator->status()); + ASSERT_OK(iterator->Refresh()); + iterator->SeekForPrev(get_key(index)); + check_iterator(iterator, expected_status, expected_value); + } + // Prev + { + Iterator* iterator = db_->NewIterator(ReadOptions()); + std::unique_ptr iterator_guard(iterator); + iterator->Seek(get_key(index + 1)); + ASSERT_TRUE(iterator->Valid()); + ASSERT_OK(iterator->status()); + iterator->Prev(); + check_iterator(iterator, expected_status, expected_value); + } + }; + + Options options = GetTestOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + DestroyAndReopen(options); + + // fill data + for (size_t i = 0; i < data.size(); i++) { + for (size_t j = 0; j < data[i].size(); j++) { + std::string key = get_key(i); + std::string value = get_value(i, j); + if (data[i][j] == "Put") { + ASSERT_OK(Put(key, value)); + ASSERT_OK(Flush()); + } else if (data[i][j] == "Merge") { + ASSERT_OK(Merge(key, value)); + ASSERT_OK(Flush()); + } + } + } + + std::string expected_value = get_value(1, 0) + "," + get_value(1, 1) + "," + + get_value(1, 2) + "," + get_value(1, 3); + Status expected_status; + verify(1, expected_status, expected_value); + +#ifndef ROCKSDB_LITE + // Test DBIter::FindValueForCurrentKeyUsingSeek flow. 
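  // Roughly: DBIter normally reconstructs the value for the current key by
  // stepping over its internal entries; once it would have to skip more than
  // max_sequential_skip_in_iterations entries, it falls back to a targeted
  // re-seek (FindValueForCurrentKeyUsingSeek). Setting the limit to 0 below
  // forces that fallback immediately, so the same checks also cover the
  // seek-based path.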
+ ASSERT_OK(dbfull()->SetOptions(cfh(), + {{"max_sequential_skip_in_iterations", "0"}})); + verify(1, expected_status, expected_value); +#endif // !ROCKSDB_LITE +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/builder.cc b/db/builder.cc index 90cfbbffbb7..88c99a08415 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -23,11 +23,13 @@ #include "db/range_del_aggregator.h" #include "db/table_cache.h" #include "db/version_edit.h" +#include "file/file_util.h" #include "file/filename.h" #include "file/read_write_util.h" #include "file/writable_file_writer.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_util.h" +#include "options/options_helper.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -43,62 +45,41 @@ namespace ROCKSDB_NAMESPACE { class TableFactory; -TableBuilder* NewTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, - WritableFileWriter* file, const CompressionType compression_type, - uint64_t sample_for_compression, const CompressionOptions& compression_opts, - int level, const bool skip_filters, const uint64_t creation_time, - const uint64_t oldest_key_time, const uint64_t target_file_size, - const uint64_t file_creation_time, const std::string& db_id, - const std::string& db_session_id) { - assert((column_family_id == +TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, + WritableFileWriter* file) { + assert((tboptions.column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == - column_family_name.empty()); - return ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, internal_comparator, - int_tbl_prop_collector_factories, compression_type, - sample_for_compression, compression_opts, - skip_filters, column_family_name, level, - creation_time, oldest_key_time, target_file_size, - file_creation_time, db_id, db_session_id), - column_family_id, file); + tboptions.column_family_name.empty()); + return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file); } Status BuildTable( - const std::string& dbname, VersionSet* versions, Env* env, FileSystem* fs, - const ImmutableCFOptions& ioptions, - const MutableCFOptions& mutable_cf_options, const FileOptions& file_options, - TableCache* table_cache, InternalIterator* iter, + const std::string& dbname, VersionSet* versions, + const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, std::vector snapshots, SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, const CompressionType compression, - uint64_t sample_for_compression, const CompressionOptions& compression_opts, - bool paranoid_file_checks, InternalStats* internal_stats, - TableFileCreationReason reason, IOStatus* io_status, + SnapshotChecker* snapshot_checker, bool paranoid_file_checks, + InternalStats* internal_stats, IOStatus* io_status, const std::shared_ptr& io_tracer, EventLogger* 
event_logger, int job_id, const Env::IOPriority io_priority, - TableProperties* table_properties, int level, const uint64_t creation_time, - const uint64_t oldest_key_time, Env::WriteLifeTimeHint write_hint, - const uint64_t file_creation_time, const std::string& db_id, - const std::string& db_session_id) { - assert((column_family_id == + TableProperties* table_properties, Env::WriteLifeTimeHint write_hint, + const std::string* full_history_ts_low, + BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries, + uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) { + assert((tboptions.column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == - column_family_name.empty()); + tboptions.column_family_name.empty()); + auto& mutable_cf_options = tboptions.moptions; + auto& ioptions = tboptions.ioptions; // Reports the IOStats for flush for every following bytes. const size_t kReportFlushIOStatsEvery = 1048576; OutputValidator output_validator( - internal_comparator, + tboptions.internal_comparator, /*enable_order_check=*/ mutable_cf_options.check_flush_compaction_key_order, /*enable_hash=*/paranoid_file_checks); @@ -106,8 +87,15 @@ Status BuildTable( meta->fd.file_size = 0; iter->SeekToFirst(); std::unique_ptr range_del_agg( - new CompactionRangeDelAggregator(&internal_comparator, snapshots)); + new CompactionRangeDelAggregator(&tboptions.internal_comparator, + snapshots)); + uint64_t num_unfragmented_tombstones = 0; + uint64_t total_tombstone_payload_bytes = 0; for (auto& range_del_iter : range_del_iters) { + num_unfragmented_tombstones += + range_del_iter->num_unfragmented_tombstones(); + total_tombstone_payload_bytes += + range_del_iter->total_tombstone_payload_bytes(); range_del_agg->AddTombstones(std::move(range_del_iter)); } @@ -117,18 +105,39 @@ Status BuildTable( std::string file_checksum = kUnknownFileChecksum; std::string file_checksum_func_name = kUnknownFileChecksumFuncName; #ifndef ROCKSDB_LITE - EventHelpers::NotifyTableFileCreationStarted( - ioptions.listeners, dbname, column_family_name, fname, job_id, reason); + EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname, + tboptions.column_family_name, + fname, job_id, tboptions.reason); #endif // !ROCKSDB_LITE + Env* env = db_options.env; + assert(env); + FileSystem* fs = db_options.fs.get(); + assert(fs); + TableProperties tp; if (iter->Valid() || !range_del_agg->IsEmpty()) { + std::unique_ptr compaction_filter; + if (ioptions.compaction_filter_factory != nullptr && + ioptions.compaction_filter_factory->ShouldFilterTableFileCreation( + tboptions.reason)) { + CompactionFilter::Context context; + context.is_full_compaction = false; + context.is_manual_compaction = false; + context.column_family_id = tboptions.column_family_id; + context.reason = tboptions.reason; + compaction_filter = + ioptions.compaction_filter_factory->CreateCompactionFilter(context); + if (compaction_filter != nullptr && + !compaction_filter->IgnoreSnapshots()) { + s.PermitUncheckedError(); + return Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + } + } + TableBuilder* builder; std::unique_ptr file_writer; - // Currently we only enable dictionary compression during compaction to the - // bottommost level. 
- CompressionOptions compression_opts_for_flush(compression_opts); - compression_opts_for_flush.max_dict_bytes = 0; - compression_opts_for_flush.zstd_max_train_bytes = 0; { std::unique_ptr file; #ifndef NDEBUG @@ -143,49 +152,51 @@ Status BuildTable( } if (!s.ok()) { EventHelpers::LogAndNotifyTableFileCreationFinished( - event_logger, ioptions.listeners, dbname, column_family_name, fname, - job_id, meta->fd, kInvalidBlobFileNumber, tp, reason, s, - file_checksum, file_checksum_func_name); + event_logger, ioptions.listeners, dbname, + tboptions.column_family_name, fname, job_id, meta->fd, + kInvalidBlobFileNumber, tp, tboptions.reason, s, file_checksum, + file_checksum_func_name); return s; } + FileTypeSet tmp_set = ioptions.checksum_handoff_file_types; file->SetIOPriority(io_priority); file->SetWriteLifeTimeHint(write_hint); - file_writer.reset(new WritableFileWriter( - std::move(file), fname, file_options, env, io_tracer, - ioptions.statistics, ioptions.listeners, - ioptions.file_checksum_gen_factory)); - - builder = NewTableBuilder( - ioptions, mutable_cf_options, internal_comparator, - int_tbl_prop_collector_factories, column_family_id, - column_family_name, file_writer.get(), compression, - sample_for_compression, compression_opts_for_flush, level, - false /* skip_filters */, creation_time, oldest_key_time, - 0 /*target_file_size*/, file_creation_time, db_id, db_session_id); + std::move(file), fname, file_options, ioptions.clock, io_tracer, + ioptions.stats, ioptions.listeners, + ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile))); + + builder = NewTableBuilder(tboptions, file_writer.get()); } - MergeHelper merge(env, internal_comparator.user_comparator(), - ioptions.merge_operator, nullptr, ioptions.info_log, - true /* internal key corruption is not ok */, - snapshots.empty() ? 0 : snapshots.back(), - snapshot_checker); + MergeHelper merge( + env, tboptions.internal_comparator.user_comparator(), + ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger, + true /* internal key corruption is not ok */, + snapshots.empty() ? 0 : snapshots.back(), snapshot_checker); std::unique_ptr blob_file_builder( (mutable_cf_options.enable_blob_files && blob_file_additions) - ? new BlobFileBuilder(versions, env, fs, &ioptions, - &mutable_cf_options, &file_options, job_id, - column_family_id, column_family_name, - io_priority, write_hint, &blob_file_paths, - blob_file_additions) + ? 
new BlobFileBuilder(versions, fs, &ioptions, &mutable_cf_options, + &file_options, job_id, + tboptions.column_family_id, + tboptions.column_family_name, io_priority, + write_hint, io_tracer, blob_callback, + &blob_file_paths, blob_file_additions) : nullptr); CompactionIterator c_iter( - iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber, - &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env, - ShouldReportDetailedTime(env, ioptions.statistics), + iter, tboptions.internal_comparator.user_comparator(), &merge, + kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot, + snapshot_checker, env, ShouldReportDetailedTime(env, ioptions.stats), true /* internal key corruption is not ok */, range_del_agg.get(), - blob_file_builder.get(), ioptions.allow_data_in_errors); + blob_file_builder.get(), ioptions.allow_data_in_errors, + /*compaction=*/nullptr, compaction_filter.get(), + /*shutting_down=*/nullptr, + /*preserve_deletes_seqnum=*/0, /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, db_options.info_log, + full_history_ts_low); c_iter.SeekToFirst(); for (; c_iter.Valid(); c_iter.Next()) { @@ -212,6 +223,7 @@ Status BuildTable( } else if (!c_iter.status().ok()) { s = c_iter.status(); } + if (s.ok()) { auto range_del_it = range_del_agg->NewIterator(); for (range_del_it->SeekToFirst(); range_del_it->Valid(); @@ -220,16 +232,17 @@ Status BuildTable( auto kv = tombstone.Serialize(); builder->Add(kv.first.Encode(), kv.second); meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(), - tombstone.seq_, internal_comparator); - } - - if (blob_file_builder) { - s = blob_file_builder->Finish(); + tombstone.seq_, + tboptions.internal_comparator); } } TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable"); const bool empty = builder->IsEmpty(); + if (num_input_entries != nullptr) { + *num_input_entries = + c_iter.num_input_entry_scanned() + num_unfragmented_tombstones; + } if (!s.ok() || empty) { builder->Abandon(); } else { @@ -245,6 +258,25 @@ Status BuildTable( meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); tp = builder->GetTableProperties(); // refresh now that builder is finished + if (memtable_payload_bytes != nullptr && + memtable_garbage_bytes != nullptr) { + const CompactionIterationStats& ci_stats = c_iter.iter_stats(); + uint64_t total_payload_bytes = ci_stats.total_input_raw_key_bytes + + ci_stats.total_input_raw_value_bytes + + total_tombstone_payload_bytes; + uint64_t total_payload_bytes_written = + (tp.raw_key_size + tp.raw_value_size); + // Prevent underflow, which may still happen at this point + // since we only support inserts, deletes, and deleteRanges. 
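  // A worked example of the accounting below (illustrative numbers only):
  // if the flushed memtables carried 10 MB of raw key/value payload plus
  // 0.5 MB of range tombstone payload, and the resulting table's properties
  // report 7 MB of raw key + raw value bytes, then
  //   *memtable_payload_bytes = 10.5 MB
  //   *memtable_garbage_bytes = 10.5 MB - 7 MB = 3.5 MB
  // i.e. garbage is simply the payload that did not survive into the output.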
+ if (total_payload_bytes_written <= total_payload_bytes) { + *memtable_payload_bytes = total_payload_bytes; + *memtable_garbage_bytes = + total_payload_bytes - total_payload_bytes_written; + } else { + *memtable_payload_bytes = 0; + *memtable_garbage_bytes = 0; + } + } if (table_properties) { *table_properties = tp; } @@ -254,7 +286,7 @@ Status BuildTable( // Finish and check for file errors TEST_SYNC_POINT("BuildTable:BeforeSyncTable"); if (s.ok() && !empty) { - StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); + StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS); *io_status = file_writer->Sync(ioptions.use_fsync); } TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile"); @@ -273,6 +305,15 @@ Status BuildTable( s = *io_status; } + if (blob_file_builder) { + if (s.ok()) { + s = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(); + } + blob_file_builder.reset(); + } + // TODO Also check the IO status when create the Iterator. if (s.ok() && !empty) { @@ -284,20 +325,20 @@ Status BuildTable( // to cache it here for further user reads ReadOptions read_options; std::unique_ptr it(table_cache->NewIterator( - read_options, file_options, internal_comparator, *meta, + read_options, file_options, tboptions.internal_comparator, *meta, nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor.get(), nullptr, (internal_stats == nullptr) ? nullptr : internal_stats->GetFileReadHist(0), TableReaderCaller::kFlush, /*arena=*/nullptr, - /*skip_filter=*/false, level, + /*skip_filter=*/false, tboptions.level_at_creation, MaxFileSizeForL0MetaPin(mutable_cf_options), /*smallest_compaction_key=*/nullptr, /*largest_compaction_key*/ nullptr, /*allow_unprepared_value*/ false)); s = it->status(); if (s.ok() && paranoid_file_checks) { - OutputValidator file_validator(internal_comparator, + OutputValidator file_validator(tboptions.internal_comparator, /*enable_order_check=*/true, /*enable_hash=*/true); for (it->SeekToFirst(); it->Valid(); it->Next()) { @@ -318,6 +359,8 @@ Status BuildTable( } if (!s.ok() || meta->fd.GetFileSize() == 0) { + TEST_SYNC_POINT("BuildTable:BeforeDeleteFile"); + constexpr IODebugContext* dbg = nullptr; Status ignored = fs->DeleteFile(fname, IOOptions(), dbg); @@ -327,11 +370,11 @@ Status BuildTable( if (blob_file_additions) { for (const std::string& blob_file_path : blob_file_paths) { - ignored = fs->DeleteFile(blob_file_path, IOOptions(), dbg); + ignored = DeleteDBFile(&db_options, blob_file_path, dbname, + /*force_bg=*/false, /*force_fg=*/false); ignored.PermitUncheckedError(); + TEST_SYNC_POINT("BuildTable::AfterDeleteFile"); } - - blob_file_additions->clear(); } } @@ -340,9 +383,9 @@ Status BuildTable( } // Output to event logger and fire events. 
EventHelpers::LogAndNotifyTableFileCreationFinished( - event_logger, ioptions.listeners, dbname, column_family_name, fname, - job_id, meta->fd, meta->oldest_blob_file_number, tp, reason, s, - file_checksum, file_checksum_func_name); + event_logger, ioptions.listeners, dbname, tboptions.column_family_name, + fname, job_id, meta->fd, meta->oldest_blob_file_number, tp, + tboptions.reason, s, file_checksum, file_checksum_func_name); return s; } diff --git a/db/builder.h b/db/builder.h index 8c80c637955..f8828f5c448 100644 --- a/db/builder.h +++ b/db/builder.h @@ -24,37 +24,20 @@ namespace ROCKSDB_NAMESPACE { -struct Options; struct FileMetaData; class VersionSet; -class Env; -struct EnvOptions; class BlobFileAddition; -class Iterator; class SnapshotChecker; class TableCache; -class VersionEdit; class TableBuilder; class WritableFileWriter; class InternalStats; +class BlobFileCompletionCallback; -// @param column_family_name Name of the column family that is also identified -// by column_family_id, or empty string if unknown. It must outlive the -// TableBuilder returned by this function. -TableBuilder* NewTableBuilder( - const ImmutableCFOptions& options, const MutableCFOptions& moptions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, - WritableFileWriter* file, const CompressionType compression_type, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, int level, - const bool skip_filters = false, const uint64_t creation_time = 0, - const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0, - const uint64_t file_creation_time = 0, const std::string& db_id = "", - const std::string& db_session_id = ""); +// Convenience function for NewTableBuilder on the embedded table_factory. +TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, + WritableFileWriter* file); // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. On success, the rest of @@ -65,30 +48,26 @@ TableBuilder* NewTableBuilder( // @param column_family_name Name of the column family that is also identified // by column_family_id, or empty string if unknown. 
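// Reading guide for the revised signature below: parameters specific to the
// table being built (compression settings, level, creation and oldest-key
// times, db id/session id, property collector factories) now travel in
// TableBuilderOptions, and process-wide settings come from ImmutableDBOptions.
// The new trailing out-parameters report flush accounting -- the number of
// input entries scanned, memtable payload bytes, and memtable garbage bytes --
// while blob_callback (a BlobFileCompletionCallback) is notified about blob
// files produced during the build.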
extern Status BuildTable( - const std::string& dbname, VersionSet* versions, Env* env, FileSystem* fs, - const ImmutableCFOptions& options, - const MutableCFOptions& mutable_cf_options, const FileOptions& file_options, - TableCache* table_cache, InternalIterator* iter, + const std::string& dbname, VersionSet* versions, + const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, + const FileOptions& file_options, TableCache* table_cache, + InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, const std::string& column_family_name, std::vector snapshots, SequenceNumber earliest_write_conflict_snapshot, - SnapshotChecker* snapshot_checker, const CompressionType compression, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, bool paranoid_file_checks, - InternalStats* internal_stats, TableFileCreationReason reason, - IOStatus* io_status, const std::shared_ptr& io_tracer, + SnapshotChecker* snapshot_checker, bool paranoid_file_checks, + InternalStats* internal_stats, IOStatus* io_status, + const std::shared_ptr& io_tracer, EventLogger* event_logger = nullptr, int job_id = 0, const Env::IOPriority io_priority = Env::IO_HIGH, - TableProperties* table_properties = nullptr, int level = -1, - const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, + TableProperties* table_properties = nullptr, Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET, - const uint64_t file_creation_time = 0, const std::string& db_id = "", - const std::string& db_session_id = ""); + const std::string* full_history_ts_low = nullptr, + BlobFileCompletionCallback* blob_callback = nullptr, + uint64_t* num_input_entries = nullptr, + uint64_t* memtable_payload_bytes = nullptr, + uint64_t* memtable_garbage_bytes = nullptr); } // namespace ROCKSDB_NAMESPACE diff --git a/db/c.cc b/db/c.cc index e196d0177e0..79fa5181d85 100644 --- a/db/c.cc +++ b/db/c.cc @@ -11,7 +11,11 @@ #include "rocksdb/c.h" -#include +#include +#include +#include +#include + #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" @@ -24,6 +28,7 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" +#include "rocksdb/perf_context.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" @@ -35,17 +40,13 @@ #include "rocksdb/utilities/db_ttl.h" #include "rocksdb/utilities/memory_util.h" #include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/table_properties_collectors.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_batch.h" -#include "rocksdb/perf_context.h" #include "utilities/merge_operators.h" -#include -#include -#include - using ROCKSDB_NAMESPACE::BackupableDBOptions; using ROCKSDB_NAMESPACE::BackupEngine; using ROCKSDB_NAMESPACE::BackupID; @@ -60,7 +61,6 @@ using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor; using ROCKSDB_NAMESPACE::ColumnFamilyHandle; using ROCKSDB_NAMESPACE::ColumnFamilyOptions; using ROCKSDB_NAMESPACE::CompactionFilter; -using ROCKSDB_NAMESPACE::CompactionFilterContext; using ROCKSDB_NAMESPACE::CompactionFilterFactory; using ROCKSDB_NAMESPACE::CompactionOptionsFIFO; using 
ROCKSDB_NAMESPACE::CompactRangeOptions; @@ -80,10 +80,12 @@ using ROCKSDB_NAMESPACE::IngestExternalFileOptions; using ROCKSDB_NAMESPACE::Iterator; using ROCKSDB_NAMESPACE::LiveFileMetaData; using ROCKSDB_NAMESPACE::Logger; +using ROCKSDB_NAMESPACE::LRUCacheOptions; +using ROCKSDB_NAMESPACE::MemoryAllocator; using ROCKSDB_NAMESPACE::MemoryUtil; using ROCKSDB_NAMESPACE::MergeOperator; -using ROCKSDB_NAMESPACE::MergeOperators; using ROCKSDB_NAMESPACE::NewBloomFilterPolicy; +using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory; using ROCKSDB_NAMESPACE::NewGenericRateLimiter; using ROCKSDB_NAMESPACE::NewLRUCache; using ROCKSDB_NAMESPACE::OptimisticTransactionDB; @@ -104,6 +106,7 @@ using ROCKSDB_NAMESPACE::SliceTransform; using ROCKSDB_NAMESPACE::Snapshot; using ROCKSDB_NAMESPACE::SstFileWriter; using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory; using ROCKSDB_NAMESPACE::Transaction; using ROCKSDB_NAMESPACE::TransactionDB; using ROCKSDB_NAMESPACE::TransactionDBOptions; @@ -115,10 +118,8 @@ using ROCKSDB_NAMESPACE::WriteBatch; using ROCKSDB_NAMESPACE::WriteBatchWithIndex; using ROCKSDB_NAMESPACE::WriteOptions; -using std::shared_ptr; using std::vector; using std::unordered_set; -using std::map; extern "C" { @@ -154,6 +155,12 @@ struct rocksdb_filelock_t { FileLock* rep; }; struct rocksdb_logger_t { std::shared_ptr rep; }; +struct rocksdb_lru_cache_options_t { + LRUCacheOptions rep; +}; +struct rocksdb_memory_allocator_t { + std::shared_ptr rep; +}; struct rocksdb_cache_t { std::shared_ptr rep; }; @@ -181,6 +188,9 @@ struct rocksdb_transaction_options_t { struct rocksdb_transaction_t { Transaction* rep; }; +struct rocksdb_backupable_db_options_t { + BackupableDBOptions rep; +}; struct rocksdb_checkpoint_t { Checkpoint* rep; }; @@ -549,6 +559,18 @@ rocksdb_backup_engine_t* rocksdb_backup_engine_open( return result; } +rocksdb_backup_engine_t* rocksdb_backup_engine_open_opts( + const rocksdb_backupable_db_options_t* options, rocksdb_env_t* env, + char** errptr) { + BackupEngine* be; + if (SaveError(errptr, BackupEngine::Open(options->rep, env->rep, &be))) { + return nullptr; + } + rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t; + result->rep = be; + return result; +} + void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr) { @@ -645,6 +667,128 @@ void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) { delete be; } +rocksdb_backupable_db_options_t* rocksdb_backupable_db_options_create( + const char* backup_dir) { + return new rocksdb_backupable_db_options_t{ + BackupableDBOptions(std::string(backup_dir))}; +} + +void rocksdb_backupable_db_options_set_backup_dir( + rocksdb_backupable_db_options_t* options, const char* backup_dir) { + options->rep.backup_dir = std::string(backup_dir); +} + +void rocksdb_backupable_db_options_set_env( + rocksdb_backupable_db_options_t* options, rocksdb_env_t* env) { + options->rep.backup_env = (env ? 
env->rep : nullptr); +} + +void rocksdb_backupable_db_options_set_share_table_files( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.share_table_files = val; +} + +unsigned char rocksdb_backupable_db_options_get_share_table_files( + rocksdb_backupable_db_options_t* options) { + return options->rep.share_table_files; +} + +void rocksdb_backupable_db_options_set_sync( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.sync = val; +} + +unsigned char rocksdb_backupable_db_options_get_sync( + rocksdb_backupable_db_options_t* options) { + return options->rep.sync; +} + +void rocksdb_backupable_db_options_set_destroy_old_data( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.destroy_old_data = val; +} + +unsigned char rocksdb_backupable_db_options_get_destroy_old_data( + rocksdb_backupable_db_options_t* options) { + return options->rep.destroy_old_data; +} + +void rocksdb_backupable_db_options_set_backup_log_files( + rocksdb_backupable_db_options_t* options, unsigned char val) { + options->rep.backup_log_files = val; +} + +unsigned char rocksdb_backupable_db_options_get_backup_log_files( + rocksdb_backupable_db_options_t* options) { + return options->rep.backup_log_files; +} + +void rocksdb_backupable_db_options_set_backup_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit) { + options->rep.backup_rate_limit = limit; +} + +uint64_t rocksdb_backupable_db_options_get_backup_rate_limit( + rocksdb_backupable_db_options_t* options) { + return options->rep.backup_rate_limit; +} + +void rocksdb_backupable_db_options_set_restore_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit) { + options->rep.restore_rate_limit = limit; +} + +uint64_t rocksdb_backupable_db_options_get_restore_rate_limit( + rocksdb_backupable_db_options_t* options) { + return options->rep.restore_rate_limit; +} + +void rocksdb_backupable_db_options_set_max_background_operations( + rocksdb_backupable_db_options_t* options, int val) { + options->rep.max_background_operations = val; +} + +int rocksdb_backupable_db_options_get_max_background_operations( + rocksdb_backupable_db_options_t* options) { + return options->rep.max_background_operations; +} + +void rocksdb_backupable_db_options_set_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options, uint64_t size) { + options->rep.callback_trigger_interval_size = size; +} + +uint64_t rocksdb_backupable_db_options_get_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options) { + return options->rep.callback_trigger_interval_size; +} + +void rocksdb_backupable_db_options_set_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options, int val) { + options->rep.max_valid_backups_to_open = val; +} + +int rocksdb_backupable_db_options_get_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options) { + return options->rep.max_valid_backups_to_open; +} + +void rocksdb_backupable_db_options_set_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options, int val) { + options->rep.share_files_with_checksum_naming = + static_cast(val); +} + +int rocksdb_backupable_db_options_get_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options) { + return static_cast(options->rep.share_files_with_checksum_naming); +} + +void rocksdb_backupable_db_options_destroy( + rocksdb_backupable_db_options_t* options) { + delete options; +} + rocksdb_checkpoint_t* 
rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr) { Checkpoint* checkpoint; @@ -1255,34 +1399,39 @@ char* rocksdb_property_value_cf( } } -void rocksdb_approximate_sizes( - rocksdb_t* db, - int num_ranges, - const char* const* range_start_key, const size_t* range_start_key_len, - const char* const* range_limit_key, const size_t* range_limit_key_len, - uint64_t* sizes) { +void rocksdb_approximate_sizes(rocksdb_t* db, int num_ranges, + const char* const* range_start_key, + const size_t* range_start_key_len, + const char* const* range_limit_key, + const size_t* range_limit_key_len, + uint64_t* sizes, char** errptr) { Range* ranges = new Range[num_ranges]; for (int i = 0; i < num_ranges; i++) { ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); } - db->rep->GetApproximateSizes(ranges, num_ranges, sizes); + Status s = db->rep->GetApproximateSizes(ranges, num_ranges, sizes); + if (!s.ok()) { + SaveError(errptr, s); + } delete[] ranges; } void rocksdb_approximate_sizes_cf( - rocksdb_t* db, - rocksdb_column_family_handle_t* column_family, - int num_ranges, - const char* const* range_start_key, const size_t* range_start_key_len, - const char* const* range_limit_key, const size_t* range_limit_key_len, - uint64_t* sizes) { + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + int num_ranges, const char* const* range_start_key, + const size_t* range_start_key_len, const char* const* range_limit_key, + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr) { Range* ranges = new Range[num_ranges]; for (int i = 0; i < num_ranges; i++) { ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); } - db->rep->GetApproximateSizes(column_family->rep, ranges, num_ranges, sizes); + Status s = db->rep->GetApproximateSizes(column_family->rep, ranges, + num_ranges, sizes); + if (!s.ok()) { + SaveError(errptr, s); + } delete[] ranges; } @@ -1363,6 +1512,10 @@ void rocksdb_flush_cf( SaveError(errptr, db->rep->Flush(options->rep, column_family->rep)); } +void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) { + SaveError(errptr, db->rep->FlushWAL(sync)); +} + void rocksdb_disable_file_deletions( rocksdb_t* db, char** errptr) { @@ -2544,6 +2697,59 @@ unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( return opt->rep.skip_checking_sst_file_sizes_on_db_open; } +/* Blob Options Settings */ +void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.enable_blob_files = val; +} +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files( + rocksdb_options_t* opt) { + return opt->rep.enable_blob_files; +} + +void rocksdb_options_set_min_blob_size(rocksdb_options_t* opt, uint64_t val) { + opt->rep.min_blob_size = val; +} + +uint64_t rocksdb_options_get_min_blob_size(rocksdb_options_t* opt) { + return opt->rep.min_blob_size; +} + +void rocksdb_options_set_blob_file_size(rocksdb_options_t* opt, uint64_t val) { + opt->rep.blob_file_size = val; +} + +uint64_t rocksdb_options_get_blob_file_size(rocksdb_options_t* opt) { + return opt->rep.blob_file_size; +} + +void rocksdb_options_set_blob_compression_type(rocksdb_options_t* opt, + int val) { + opt->rep.blob_compression_type = static_cast(val); +} + +int rocksdb_options_get_blob_compression_type(rocksdb_options_t* opt) { + return opt->rep.blob_compression_type; +} + +void 
rocksdb_options_set_enable_blob_gc(rocksdb_options_t* opt, + unsigned char val) { + opt->rep.enable_blob_garbage_collection = val; +} + +unsigned char rocksdb_options_get_enable_blob_gc(rocksdb_options_t* opt) { + return opt->rep.enable_blob_garbage_collection; +} + +void rocksdb_options_set_blob_gc_age_cutoff(rocksdb_options_t* opt, + double val) { + opt->rep.blob_garbage_collection_age_cutoff = val; +} + +double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) { + return opt->rep.blob_garbage_collection_age_cutoff; +} + void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { opt->rep.num_levels = n; } @@ -2636,6 +2842,14 @@ void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes( opt->rep.bottommost_compression_opts.enabled = enabled; } +void rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes, + unsigned char enabled) { + opt->rep.bottommost_compression_opts.max_dict_buffer_bytes = + max_dict_buffer_bytes; + opt->rep.bottommost_compression_opts.enabled = enabled; +} + void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits, int level, int strategy, int max_dict_bytes) { @@ -2650,6 +2864,31 @@ void rocksdb_options_set_compression_options_zstd_max_train_bytes( opt->rep.compression_opts.zstd_max_train_bytes = zstd_max_train_bytes; } +int rocksdb_options_get_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.zstd_max_train_bytes; +} + +void rocksdb_options_set_compression_options_parallel_threads( + rocksdb_options_t* opt, int value) { + opt->rep.compression_opts.parallel_threads = value; +} + +int rocksdb_options_get_compression_options_parallel_threads( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.parallel_threads; +} + +void rocksdb_options_set_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes) { + opt->rep.compression_opts.max_dict_buffer_bytes = max_dict_buffer_bytes; +} + +uint64_t rocksdb_options_get_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt) { + return opt->rep.compression_opts.max_dict_buffer_bytes; +} + void rocksdb_options_set_prefix_extractor( rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) { opt->rep.prefix_extractor.reset(prefix_extractor); @@ -2808,6 +3047,8 @@ void rocksdb_options_set_access_hint_on_compaction_start( opt->rep.access_hint_on_compaction_start = ROCKSDB_NAMESPACE::Options::WILLNEED; break; + default: + assert(0); } } @@ -3262,6 +3503,15 @@ unsigned char rocksdb_options_get_atomic_flush(rocksdb_options_t* opt) { return opt->rep.atomic_flush; } +void rocksdb_options_set_manual_wal_flush(rocksdb_options_t* opt, + unsigned char manual_wal_flush) { + opt->rep.manual_wal_flush = manual_wal_flush; +} + +unsigned char rocksdb_options_get_manual_wal_flush(rocksdb_options_t* opt) { + return opt->rep.manual_wal_flush; +} + rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( int64_t rate_bytes_per_sec, int64_t refill_period_us, @@ -3283,6 +3533,14 @@ void rocksdb_options_set_row_cache(rocksdb_options_t* opt, rocksdb_cache_t* cach } } +void rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger) { + std::shared_ptr + compact_on_del = + NewCompactOnDeletionCollectorFactory(window_size, num_dels_trigger); + opt->rep.table_properties_collector_factories.emplace_back(compact_on_del); +} + void 
rocksdb_set_perf_level(int v) { PerfLevel level = static_cast(v); SetPerfLevel(level); @@ -3798,6 +4056,25 @@ unsigned char rocksdb_readoptions_get_ignore_range_deletions( return opt->rep.ignore_range_deletions; } +void rocksdb_readoptions_set_deadline(rocksdb_readoptions_t* opt, + uint64_t microseconds) { + opt->rep.deadline = std::chrono::microseconds(microseconds); +} + +uint64_t rocksdb_readoptions_get_deadline(rocksdb_readoptions_t* opt) { + return opt->rep.deadline.count(); +} + +void rocksdb_readoptions_set_io_timeout(rocksdb_readoptions_t* opt, + uint64_t microseconds) { + opt->rep.io_timeout = std::chrono::microseconds(microseconds); +} + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) { + return opt->rep.io_timeout.count(); +} + rocksdb_writeoptions_t* rocksdb_writeoptions_create() { return new rocksdb_writeoptions_t; } @@ -3930,16 +4207,58 @@ unsigned char rocksdb_flushoptions_get_wait(rocksdb_flushoptions_t* opt) { return opt->rep.wait; } +rocksdb_memory_allocator_t* rocksdb_jemalloc_nodump_allocator_create( + char** errptr) { + rocksdb_memory_allocator_t* allocator = new rocksdb_memory_allocator_t; + ROCKSDB_NAMESPACE::JemallocAllocatorOptions options; + SaveError(errptr, ROCKSDB_NAMESPACE::NewJemallocNodumpAllocator( + options, &allocator->rep)); + return allocator; +} + +void rocksdb_memory_allocator_destroy(rocksdb_memory_allocator_t* allocator) { + delete allocator; +} + +rocksdb_lru_cache_options_t* rocksdb_lru_cache_options_create() { + return new rocksdb_lru_cache_options_t; +} + +void rocksdb_lru_cache_options_destroy(rocksdb_lru_cache_options_t* opt) { + delete opt; +} + +void rocksdb_lru_cache_options_set_capacity(rocksdb_lru_cache_options_t* opt, + size_t capacity) { + opt->rep.capacity = capacity; +} + +void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t* opt, rocksdb_memory_allocator_t* allocator) { + opt->rep.memory_allocator = allocator->rep; +} + rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) { rocksdb_cache_t* c = new rocksdb_cache_t; c->rep = NewLRUCache(capacity); return c; } +rocksdb_cache_t* rocksdb_cache_create_lru_opts( + rocksdb_lru_cache_options_t* opt) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = NewLRUCache(opt->rep); + return c; +} + void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; } +void rocksdb_cache_disown_data(rocksdb_cache_t* cache) { + cache->rep->DisownData(); +} + void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) { cache->rep->SetCapacity(capacity); } @@ -3985,20 +4304,36 @@ void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n); } +int rocksdb_env_get_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(); +} + void rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n, Env::BOTTOM); } +int rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::BOTTOM); +} + void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n, Env::HIGH); } +int rocksdb_env_get_high_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::HIGH); +} + void rocksdb_env_set_low_priority_background_threads(rocksdb_env_t* env, int n) { env->rep->SetBackgroundThreads(n, Env::LOW); } +int 
rocksdb_env_get_low_priority_background_threads(rocksdb_env_t* env) { + return env->rep->GetBackgroundThreads(Env::LOW); +} + void rocksdb_env_join_all_threads(rocksdb_env_t* env) { env->rep->WaitForJoin(); } @@ -4226,32 +4561,62 @@ void rocksdb_universal_compaction_options_set_size_ratio( uco->rep->size_ratio = ratio; } +int rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->size_ratio; +} + void rocksdb_universal_compaction_options_set_min_merge_width( rocksdb_universal_compaction_options_t* uco, int w) { uco->rep->min_merge_width = w; } +int rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->min_merge_width; +} + void rocksdb_universal_compaction_options_set_max_merge_width( rocksdb_universal_compaction_options_t* uco, int w) { uco->rep->max_merge_width = w; } +int rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->max_merge_width; +} + void rocksdb_universal_compaction_options_set_max_size_amplification_percent( rocksdb_universal_compaction_options_t* uco, int p) { uco->rep->max_size_amplification_percent = p; } +int rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->max_size_amplification_percent; +} + void rocksdb_universal_compaction_options_set_compression_size_percent( rocksdb_universal_compaction_options_t* uco, int p) { uco->rep->compression_size_percent = p; } +int rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t* uco) { + return uco->rep->compression_size_percent; +} + void rocksdb_universal_compaction_options_set_stop_style( rocksdb_universal_compaction_options_t* uco, int style) { uco->rep->stop_style = static_cast(style); } +int rocksdb_universal_compaction_options_get_stop_style( + rocksdb_universal_compaction_options_t* uco) { + return static_cast(uco->rep->stop_style); +} + void rocksdb_universal_compaction_options_destroy( rocksdb_universal_compaction_options_t* uco) { delete uco->rep; @@ -4269,6 +4634,11 @@ void rocksdb_fifo_compaction_options_set_max_table_files_size( fifo_opts->rep.max_table_files_size = size; } +uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts) { + return fifo_opts->rep.max_table_files_size; +} + void rocksdb_fifo_compaction_options_destroy( rocksdb_fifo_compaction_options_t* fifo_opts) { delete fifo_opts; @@ -4567,7 +4937,10 @@ void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) { const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot( rocksdb_transaction_t* txn) { - rocksdb_snapshot_t* result = new rocksdb_snapshot_t; + // This will be freed later on using free, so use malloc here to avoid a + // mismatch + rocksdb_snapshot_t* result = + (rocksdb_snapshot_t*)malloc(sizeof(rocksdb_snapshot_t)); result->rep = txn->rep->GetSnapshot(); return result; } diff --git a/db/c_test.c b/db/c_test.c index a01336738b3..5b7459b069f 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -517,6 +517,9 @@ int main(int argc, char** argv) { coptions = rocksdb_compactoptions_create(); rocksdb_compactoptions_set_exclusive_manual_compaction(coptions, 1); + rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000, + 10001); + StartPhase("destroy"); rocksdb_destroy_db(options, dbname, &err); Free(&err); @@ -988,7 +991,9 @@ int 
main(int argc, char** argv) { &err); CheckNoError(err); } - rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes); + rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes, + &err); + CheckNoError(err); CheckCondition(sizes[0] > 0); CheckCondition(sizes[1] > 0); } @@ -1117,9 +1122,8 @@ int main(int argc, char** argv) { // Essentially a fingerprint of the block-based Bloom schema CheckCondition(hits == 241); } else { - // Essentially a fingerprint of the full Bloom schema(s), - // format_version < 5, which vary for three different CACHE_LINE_SIZEs - CheckCondition(hits == 224 || hits == 180 || hits == 125); + // Essentially a fingerprint of full Bloom schema, format_version=5 + CheckCondition(hits == 188); } CheckCondition( (keys_to_query - hits) == @@ -1275,6 +1279,9 @@ int main(int argc, char** argv) { CheckPinGetCF(db, roptions, handles[1], "box", "c"); rocksdb_writebatch_destroy(wb); + rocksdb_flush_wal(db, 1, &err); + CheckNoError(err); + const char* keys[3] = { "box", "box", "barfooxx" }; const rocksdb_column_family_handle_t* get_handles[3] = { handles[0], handles[1], handles[1] }; const size_t keys_sizes[3] = { 3, 3, 8 }; @@ -1758,6 +1765,28 @@ int main(int argc, char** argv) { rocksdb_options_set_atomic_flush(o, 1); CheckCondition(1 == rocksdb_options_get_atomic_flush(o)); + rocksdb_options_set_manual_wal_flush(o, 1); + CheckCondition(1 == rocksdb_options_get_manual_wal_flush(o)); + + /* Blob Options */ + rocksdb_options_set_enable_blob_files(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_blob_files(o)); + + rocksdb_options_set_min_blob_size(o, 29); + CheckCondition(29 == rocksdb_options_get_min_blob_size(o)); + + rocksdb_options_set_blob_file_size(o, 30); + CheckCondition(30 == rocksdb_options_get_blob_file_size(o)); + + rocksdb_options_set_blob_compression_type(o, 4); + CheckCondition(4 == rocksdb_options_get_blob_compression_type(o)); + + rocksdb_options_set_enable_blob_gc(o, 1); + CheckCondition(1 == rocksdb_options_get_enable_blob_gc(o)); + + rocksdb_options_set_blob_gc_age_cutoff(o, 0.75); + CheckCondition(0.75 == rocksdb_options_get_blob_gc_age_cutoff(o)); + // Create a copy that should be equal to the original. 
rocksdb_options_t* copy; copy = rocksdb_options_create_copy(o); @@ -2279,6 +2308,12 @@ int main(int argc, char** argv) { rocksdb_readoptions_set_ignore_range_deletions(ro, 1); CheckCondition(1 == rocksdb_readoptions_get_ignore_range_deletions(ro)); + rocksdb_readoptions_set_deadline(ro, 300); + CheckCondition(300 == rocksdb_readoptions_get_deadline(ro)); + + rocksdb_readoptions_set_io_timeout(ro, 400); + CheckCondition(400 == rocksdb_readoptions_get_io_timeout(ro)); + rocksdb_readoptions_destroy(ro); } @@ -2355,6 +2390,177 @@ int main(int argc, char** argv) { rocksdb_cache_destroy(co); } + StartPhase("jemalloc_nodump_allocator"); + { + rocksdb_memory_allocator_t* allocator; + allocator = rocksdb_jemalloc_nodump_allocator_create(&err); + if (err != NULL) { + // not supported on all platforms, allow unsupported error + const char* ni = "Not implemented: "; + size_t ni_len = strlen(ni); + size_t err_len = strlen(err); + + CheckCondition(err_len >= ni_len); + CheckCondition(memcmp(ni, err, ni_len) == 0); + Free(&err); + } else { + rocksdb_cache_t* co; + rocksdb_lru_cache_options_t* copts; + + copts = rocksdb_lru_cache_options_create(); + + rocksdb_lru_cache_options_set_capacity(copts, 100); + rocksdb_lru_cache_options_set_memory_allocator(copts, allocator); + + co = rocksdb_cache_create_lru_opts(copts); + CheckCondition(100 == rocksdb_cache_get_capacity(co)); + + rocksdb_cache_destroy(co); + rocksdb_lru_cache_options_destroy(copts); + } + rocksdb_memory_allocator_destroy(allocator); + } + + StartPhase("env"); + { + rocksdb_env_t* e; + e = rocksdb_create_default_env(); + + rocksdb_env_set_background_threads(e, 10); + CheckCondition(10 == rocksdb_env_get_background_threads(e)); + + rocksdb_env_set_high_priority_background_threads(e, 20); + CheckCondition(20 == rocksdb_env_get_high_priority_background_threads(e)); + + rocksdb_env_set_low_priority_background_threads(e, 30); + CheckCondition(30 == rocksdb_env_get_low_priority_background_threads(e)); + + rocksdb_env_set_bottom_priority_background_threads(e, 40); + CheckCondition(40 == rocksdb_env_get_bottom_priority_background_threads(e)); + + rocksdb_env_destroy(e); + } + + StartPhase("universal_compaction_options"); + { + rocksdb_universal_compaction_options_t* uco; + uco = rocksdb_universal_compaction_options_create(); + + rocksdb_universal_compaction_options_set_size_ratio(uco, 5); + CheckCondition(5 == + rocksdb_universal_compaction_options_get_size_ratio(uco)); + + rocksdb_universal_compaction_options_set_min_merge_width(uco, 15); + CheckCondition( + 15 == rocksdb_universal_compaction_options_get_min_merge_width(uco)); + + rocksdb_universal_compaction_options_set_max_merge_width(uco, 25); + CheckCondition( + 25 == rocksdb_universal_compaction_options_get_max_merge_width(uco)); + + rocksdb_universal_compaction_options_set_max_size_amplification_percent(uco, + 35); + CheckCondition( + 35 == + rocksdb_universal_compaction_options_get_max_size_amplification_percent( + uco)); + + rocksdb_universal_compaction_options_set_compression_size_percent(uco, 45); + CheckCondition( + 45 == + rocksdb_universal_compaction_options_get_compression_size_percent(uco)); + + rocksdb_universal_compaction_options_set_stop_style(uco, 1); + CheckCondition(1 == + rocksdb_universal_compaction_options_get_stop_style(uco)); + + rocksdb_universal_compaction_options_destroy(uco); + } + + StartPhase("fifo_compaction_options"); + { + rocksdb_fifo_compaction_options_t* fco; + fco = rocksdb_fifo_compaction_options_create(); + + 
rocksdb_fifo_compaction_options_set_max_table_files_size(fco, 100000); + CheckCondition( + 100000 == + rocksdb_fifo_compaction_options_get_max_table_files_size(fco)); + + rocksdb_fifo_compaction_options_destroy(fco); + } + + StartPhase("backupable_db_option"); + { + rocksdb_backupable_db_options_t* bdo; + bdo = rocksdb_backupable_db_options_create("path"); + + rocksdb_backupable_db_options_set_share_table_files(bdo, 1); + CheckCondition(1 == + rocksdb_backupable_db_options_get_share_table_files(bdo)); + + rocksdb_backupable_db_options_set_sync(bdo, 1); + CheckCondition(1 == rocksdb_backupable_db_options_get_sync(bdo)); + + rocksdb_backupable_db_options_set_destroy_old_data(bdo, 1); + CheckCondition(1 == + rocksdb_backupable_db_options_get_destroy_old_data(bdo)); + + rocksdb_backupable_db_options_set_backup_log_files(bdo, 1); + CheckCondition(1 == + rocksdb_backupable_db_options_get_backup_log_files(bdo)); + + rocksdb_backupable_db_options_set_backup_rate_limit(bdo, 123); + CheckCondition(123 == + rocksdb_backupable_db_options_get_backup_rate_limit(bdo)); + + rocksdb_backupable_db_options_set_restore_rate_limit(bdo, 37); + CheckCondition(37 == + rocksdb_backupable_db_options_get_restore_rate_limit(bdo)); + + rocksdb_backupable_db_options_set_max_background_operations(bdo, 20); + CheckCondition( + 20 == rocksdb_backupable_db_options_get_max_background_operations(bdo)); + + rocksdb_backupable_db_options_set_callback_trigger_interval_size(bdo, 9000); + CheckCondition( + 9000 == + rocksdb_backupable_db_options_get_callback_trigger_interval_size(bdo)); + + rocksdb_backupable_db_options_set_max_valid_backups_to_open(bdo, 40); + CheckCondition( + 40 == rocksdb_backupable_db_options_get_max_valid_backups_to_open(bdo)); + + rocksdb_backupable_db_options_set_share_files_with_checksum_naming(bdo, 2); + CheckCondition( + 2 == rocksdb_backupable_db_options_get_share_files_with_checksum_naming( + bdo)); + + rocksdb_backupable_db_options_destroy(bdo); + } + + StartPhase("compression_options"); + { + rocksdb_options_t* co; + co = rocksdb_options_create(); + + rocksdb_options_set_compression_options_zstd_max_train_bytes(co, 100); + CheckCondition( + 100 == + rocksdb_options_get_compression_options_zstd_max_train_bytes(co)); + + rocksdb_options_set_compression_options_parallel_threads(co, 2); + CheckCondition( + 2 == rocksdb_options_get_compression_options_parallel_threads(co)); + + rocksdb_options_set_compression_options_max_dict_buffer_bytes(co, 200); + CheckCondition( + 200 == + rocksdb_options_get_compression_options_max_dict_buffer_bytes(co)); + + rocksdb_options_destroy(co); + } + StartPhase("iterate_upper_bound"); { // Create new empty database @@ -2755,7 +2961,7 @@ int main(int argc, char** argv) { #else -int main() { +int main(void) { fprintf(stderr, "SKIPPED\n"); return 0; } diff --git a/db/column_family.cc b/db/column_family.cc index d0a16dd48c4..c168f2b1d12 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -16,6 +16,7 @@ #include #include +#include "db/blob/blob_file_cache.h" #include "db/compaction/compaction_picker.h" #include "db/compaction/compaction_picker_fifo.h" #include "db/compaction/compaction_picker_level.h" @@ -32,6 +33,7 @@ #include "monitoring/thread_status_util.h" #include "options/options_helper.h" #include "port/port.h" +#include "rocksdb/convenience.h" #include "rocksdb/table.h" #include "table/merging_iterator.h" #include "util/autovector.h" @@ -107,8 +109,9 @@ const Comparator* ColumnFamilyHandleImpl::GetComparator() const { void GetIntTblPropCollectorFactory( 
const ImmutableCFOptions& ioptions, - std::vector>* - int_tbl_prop_collector_factories) { + IntTblPropCollectorFactories* int_tbl_prop_collector_factories) { + assert(int_tbl_prop_collector_factories); + auto& collector_factories = ioptions.table_properties_collector_factories; for (size_t i = 0; i < ioptions.table_properties_collector_factories.size(); ++i) { @@ -212,7 +215,8 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, // if user sets arena_block_size, we trust user to use this value. Otherwise, // calculate a proper value from writer_buffer_size; if (result.arena_block_size <= 0) { - result.arena_block_size = result.write_buffer_size / 8; + result.arena_block_size = + std::min(size_t{1024 * 1024}, result.write_buffer_size / 8); // Align up to 4k const size_t align = 4 * 1024; @@ -281,7 +285,7 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, } if (result.level0_file_num_compaction_trigger == 0) { - ROCKS_LOG_WARN(db_options.info_log.get(), + ROCKS_LOG_WARN(db_options.logger, "level0_file_num_compaction_trigger cannot be 0"); result.level0_file_num_compaction_trigger = 1; } @@ -290,7 +294,7 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.level0_slowdown_writes_trigger || result.level0_slowdown_writes_trigger < result.level0_file_num_compaction_trigger) { - ROCKS_LOG_WARN(db_options.info_log.get(), + ROCKS_LOG_WARN(db_options.logger, "This condition must be satisfied: " "level0_stop_writes_trigger(%d) >= " "level0_slowdown_writes_trigger(%d) >= " @@ -307,7 +311,7 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.level0_slowdown_writes_trigger) { result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger; } - ROCKS_LOG_WARN(db_options.info_log.get(), + ROCKS_LOG_WARN(db_options.logger, "Adjust the value to " "level0_stop_writes_trigger(%d)" "level0_slowdown_writes_trigger(%d)" @@ -334,7 +338,9 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, // was not used) auto sfm = static_cast(db_options.sst_file_manager.get()); for (size_t i = 0; i < result.cf_paths.size(); i++) { - DeleteScheduler::CleanupDirectory(db_options.env, sfm, result.cf_paths[i].path); + DeleteScheduler::CleanupDirectory(db_options.env, sfm, + result.cf_paths[i].path) + .PermitUncheckedError(); } #endif @@ -448,9 +454,7 @@ void SuperVersion::Cleanup() { to_delete.push_back(m); } current->Unref(); - if (cfd->Unref()) { - delete cfd; - } + cfd->UnrefAndTryDelete(this); } void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem, @@ -500,7 +504,8 @@ ColumnFamilyData::ColumnFamilyData( const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options, const FileOptions& file_options, ColumnFamilySet* column_family_set, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : id_(id), name_(name), dummy_versions_(_dummy_versions), @@ -543,7 +548,7 @@ ColumnFamilyData::ColumnFamilyData( db_paths_registered_ = true; } else { ROCKS_LOG_ERROR( - ioptions_.info_log, + ioptions_.logger, "Failed to register data paths of column family (id: %d, name: %s)", id_, name_.c_str()); } @@ -556,9 +561,14 @@ ColumnFamilyData::ColumnFamilyData( // if _dummy_versions is nullptr, then this is a dummy column family. 
if (_dummy_versions != nullptr) { internal_stats_.reset( - new InternalStats(ioptions_.num_levels, db_options.env, this)); + new InternalStats(ioptions_.num_levels, ioptions_.clock, this)); table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache, - block_cache_tracer, io_tracer)); + block_cache_tracer, io_tracer, + db_session_id)); + blob_file_cache_.reset( + new BlobFileCache(_table_cache, ioptions(), soptions(), id_, + internal_stats_->GetBlobFileReadHist(), io_tracer)); + if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); @@ -572,13 +582,13 @@ ColumnFamilyData::ColumnFamilyData( } else if (ioptions_.compaction_style == kCompactionStyleNone) { compaction_picker_.reset(new NullCompactionPicker( ioptions_, &internal_comparator_)); - ROCKS_LOG_WARN(ioptions_.info_log, + ROCKS_LOG_WARN(ioptions_.logger, "Column family %s does not use any background compaction. " "Compactions can only be done via CompactFiles\n", GetName().c_str()); #endif // !ROCKSDB_LITE } else { - ROCKS_LOG_ERROR(ioptions_.info_log, + ROCKS_LOG_ERROR(ioptions_.logger, "Unable to recognize the specified compaction style %d. " "Column family %s will use kCompactionStyleLevel.\n", ioptions_.compaction_style, GetName().c_str()); @@ -587,12 +597,12 @@ ColumnFamilyData::ColumnFamilyData( } if (column_family_set_->NumberOfColumnFamilies() < 10) { - ROCKS_LOG_INFO(ioptions_.info_log, + ROCKS_LOG_INFO(ioptions_.logger, "--------------- Options for column family [%s]:\n", name.c_str()); - initial_cf_options_.Dump(ioptions_.info_log); + initial_cf_options_.Dump(ioptions_.logger); } else { - ROCKS_LOG_INFO(ioptions_.info_log, "\t(skipping printing options)\n"); + ROCKS_LOG_INFO(ioptions_.logger, "\t(skipping printing options)\n"); } } @@ -627,7 +637,7 @@ ColumnFamilyData::~ColumnFamilyData() { if (dummy_versions_ != nullptr) { // List must be empty - assert(dummy_versions_->TEST_Next() == dummy_versions_); + assert(dummy_versions_->Next() == dummy_versions_); bool deleted __attribute__((__unused__)); deleted = dummy_versions_->Unref(); assert(deleted); @@ -648,14 +658,14 @@ ColumnFamilyData::~ColumnFamilyData() { Status s = ioptions_.env->UnregisterDbPaths(GetDbPaths()); if (!s.ok()) { ROCKS_LOG_ERROR( - ioptions_.info_log, + ioptions_.logger, "Failed to unregister data paths of column family (id: %d, name: %s)", id_, name_.c_str()); } } } -bool ColumnFamilyData::UnrefAndTryDelete() { +bool ColumnFamilyData::UnrefAndTryDelete(SuperVersion* sv_under_cleanup) { int old_refs = refs_.fetch_sub(1); assert(old_refs > 0); @@ -665,7 +675,11 @@ bool ColumnFamilyData::UnrefAndTryDelete() { return true; } - if (old_refs == 2 && super_version_ != nullptr) { + // If called under SuperVersion::Cleanup, we should not re-enter Cleanup on + // the same SuperVersion. (But while installing a new SuperVersion, this + // cfd could be referenced only by two SuperVersions.) 
+ if (old_refs == 2 && super_version_ != nullptr && + super_version_ != sv_under_cleanup) { // Only the super_version_ holds me SuperVersion* sv = super_version_; super_version_ = nullptr; @@ -703,9 +717,7 @@ uint64_t ColumnFamilyData::OldestLogToKeep() { auto current_log = GetLogNumber(); if (allow_2pc_) { - autovector empty_list; - auto imm_prep_log = - imm()->PrecomputeMinLogContainingPrepSection(empty_list); + auto imm_prep_log = imm()->PrecomputeMinLogContainingPrepSection(); auto mem_prep_log = mem()->GetMinLogContainingPrepSection(); if (imm_prep_log > 0 && imm_prep_log < current_log) { @@ -827,7 +839,8 @@ std::pair ColumnFamilyData::GetWriteStallConditionAndCause( int num_unflushed_memtables, int num_l0_files, uint64_t num_compaction_needed_bytes, - const MutableCFOptions& mutable_cf_options) { + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& immutable_cf_options) { if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) { return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit}; } else if (!mutable_cf_options.disable_auto_compactions && @@ -841,7 +854,9 @@ ColumnFamilyData::GetWriteStallConditionAndCause( WriteStallCause::kPendingCompactionBytes}; } else if (mutable_cf_options.max_write_buffer_number > 3 && num_unflushed_memtables >= - mutable_cf_options.max_write_buffer_number - 1) { + mutable_cf_options.max_write_buffer_number - 1 && + num_unflushed_memtables - 1 >= + immutable_cf_options.min_write_buffer_number_to_merge) { return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit}; } else if (!mutable_cf_options.disable_auto_compactions && mutable_cf_options.level0_slowdown_writes_trigger >= 0 && @@ -869,7 +884,8 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( auto write_stall_condition_and_cause = GetWriteStallConditionAndCause( imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(), - vstorage->estimated_compaction_needed_bytes(), mutable_cf_options); + vstorage->estimated_compaction_needed_bytes(), mutable_cf_options, + *ioptions()); write_stall_condition = write_stall_condition_and_cause.first; auto write_stall_cause = write_stall_condition_and_cause.second; @@ -881,7 +897,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stopping writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d", name_.c_str(), imm()->NumNotFlushed(), @@ -894,7 +910,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( internal_stats_->AddCFStats( InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1); } - ROCKS_LOG_WARN(ioptions_.info_log, + ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stopping writes because we have %d level-0 files", name_.c_str(), vstorage->l0_delay_trigger_count()); } else if (write_stall_condition == WriteStallCondition::kStopped && @@ -903,7 +919,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( internal_stats_->AddCFStats( InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stopping writes because of estimated pending compaction " "bytes %" PRIu64, name_.c_str(), compaction_needed_bytes); @@ -915,7 +931,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( 
mutable_cf_options.disable_auto_compactions); internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stalling writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d " "rate %" PRIu64, @@ -937,7 +953,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( internal_stats_->AddCFStats( InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1); } - ROCKS_LOG_WARN(ioptions_.info_log, + ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stalling writes because we have %d level-0 files " "rate %" PRIu64, name_.c_str(), vstorage->l0_delay_trigger_count(), @@ -962,7 +978,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( internal_stats_->AddCFStats( InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1); ROCKS_LOG_WARN( - ioptions_.info_log, + ioptions_.logger, "[%s] Stalling writes because of estimated pending compaction " "bytes %" PRIu64 " rate %" PRIu64, name_.c_str(), vstorage->estimated_compaction_needed_bytes(), @@ -976,7 +992,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( write_controller_token_ = write_controller->GetCompactionPressureToken(); ROCKS_LOG_INFO( - ioptions_.info_log, + ioptions_.logger, "[%s] Increasing compaction threads because we have %d level-0 " "files ", name_.c_str(), vstorage->l0_delay_trigger_count()); @@ -990,7 +1006,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( write_controller->GetCompactionPressureToken(); if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) { ROCKS_LOG_INFO( - ioptions_.info_log, + ioptions_.logger, "[%s] Increasing compaction threads because of estimated pending " "compaction " "bytes %" PRIu64, @@ -1083,7 +1099,7 @@ bool ColumnFamilyData::RangeOverlapWithCompaction( Status ColumnFamilyData::RangesOverlapWithMemtables( const autovector& ranges, SuperVersion* super_version, - bool* overlap) { + bool allow_data_in_errors, bool* overlap) { assert(overlap != nullptr); *overlap = false; // Create an InternalIterator over all unflushed memtables @@ -1116,13 +1132,12 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( memtable_iter->Seek(range_start.Encode()); status = memtable_iter->status(); ParsedInternalKey seek_result; - if (status.ok()) { - if (memtable_iter->Valid() && - ParseInternalKey(memtable_iter->key(), &seek_result) != - Status::OK()) { - status = Status::Corruption("DB have corrupted keys"); - } + + if (status.ok() && memtable_iter->Valid()) { + status = ParseInternalKey(memtable_iter->key(), &seek_result, + allow_data_in_errors); } + if (status.ok()) { if (memtable_iter->Valid() && ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) { @@ -1192,11 +1207,11 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) { SuperVersion* sv = static_cast(ptr); if (sv == SuperVersion::kSVObsolete || sv->version_number != super_version_number_.load()) { - RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES); + RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES); SuperVersion* sv_to_delete = nullptr; if (sv && sv->Unref()) { - RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS); + RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS); db->mutex()->Lock(); // NOTE: underlying resources held by superversion (sst files) might // not be released until the next background job. 
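A minimal usage sketch (not part of the patch) of the blob settings that the ValidateOptions() hunk below range-checks; the field names are the actual ColumnFamilyOptions members touched elsewhere in this patch, the values are arbitrary:

  #include "rocksdb/options.h"

  rocksdb::ColumnFamilyOptions MakeBlobCFOptions() {
    rocksdb::ColumnFamilyOptions cf_options;
    cf_options.enable_blob_files = true;                   // write large values to blob files
    cf_options.min_blob_size = 1024;                        // values >= 1 KiB are separated out
    cf_options.enable_blob_garbage_collection = true;
    cf_options.blob_garbage_collection_age_cutoff = 0.25;   // rejected below unless in [0.0, 1.0]
    return cf_options;
  }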
@@ -1335,24 +1350,39 @@ Status ColumnFamilyData::ValidateOptions( "Block-Based Table format. "); } } + + if (cf_options.enable_blob_garbage_collection && + (cf_options.blob_garbage_collection_age_cutoff < 0.0 || + cf_options.blob_garbage_collection_age_cutoff > 1.0)) { + return Status::InvalidArgument( + "The age cutoff for blob garbage collection should be in the range " + "[0.0, 1.0]."); + } + + if (cf_options.compaction_style == kCompactionStyleFIFO && + db_options.max_open_files != -1 && cf_options.ttl > 0) { + return Status::NotSupported( + "FIFO compaction only supported with max_open_files = -1."); + } + return s; } #ifndef ROCKSDB_LITE Status ColumnFamilyData::SetOptions( - const DBOptions& db_options, + const DBOptions& db_opts, const std::unordered_map& options_map) { - MutableCFOptions new_mutable_cf_options; - Status s = - GetMutableOptionsFromStrings(mutable_cf_options_, options_map, - ioptions_.info_log, &new_mutable_cf_options); + ColumnFamilyOptions cf_opts = + BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_); + ConfigOptions config_opts; + config_opts.mutable_options_only = true; + Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map, + &cf_opts); if (s.ok()) { - ColumnFamilyOptions cf_options = - BuildColumnFamilyOptions(initial_cf_options_, new_mutable_cf_options); - s = ValidateOptions(db_options, cf_options); + s = ValidateOptions(db_opts, cf_opts); } if (s.ok()) { - mutable_cf_options_ = new_mutable_cf_options; + mutable_cf_options_ = MutableCFOptions(cf_opts); mutable_cf_options_.RefreshDerivedOptions(ioptions_); } return s; @@ -1391,7 +1421,8 @@ Status ColumnFamilyData::AddDirectories( if (existing_dir == created_dirs->end()) { std::unique_ptr path_directory; - s = DBImpl::CreateAndNewDirectory(ioptions_.fs, p.path, &path_directory); + s = DBImpl::CreateAndNewDirectory(ioptions_.fs.get(), p.path, + &path_directory); if (!s.ok()) { return s; } @@ -1422,12 +1453,13 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, WriteBufferManager* _write_buffer_manager, WriteController* _write_controller, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : max_column_family_(0), dummy_cfd_(new ColumnFamilyData( ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options, file_options, nullptr, - block_cache_tracer, io_tracer)), + block_cache_tracer, io_tracer, db_session_id)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), @@ -1436,7 +1468,8 @@ ColumnFamilySet::ColumnFamilySet(const std::string& dbname, write_buffer_manager_(_write_buffer_manager), write_controller_(_write_controller), block_cache_tracer_(block_cache_tracer), - io_tracer_(io_tracer) { + io_tracer_(io_tracer), + db_session_id_(db_session_id) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; dummy_cfd_->next_ = dummy_cfd_; @@ -1502,7 +1535,8 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData( id, name, dummy_versions, table_cache_, write_buffer_manager_, options, - *db_options_, file_options_, this, block_cache_tracer_, io_tracer_); + *db_options_, file_options_, this, block_cache_tracer_, io_tracer_, + db_session_id_); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); max_column_family_ = std::max(max_column_family_, id); diff --git 
a/db/column_family.h b/db/column_family.h index 0a251e54537..7ad560e44d9 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -44,6 +44,7 @@ class LogBuffer; class InstrumentedMutex; class InstrumentedMutexLock; struct SuperVersionContext; +class BlobFileCache; extern const double kIncSlowdownRatio; // This file contains a list of data structures for managing column family @@ -252,13 +253,12 @@ extern Status CheckCFPathsSupported(const DBOptions& db_options, extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& src); -// Wrap user defined table proproties collector factories `from cf_options` +// Wrap user defined table properties collector factories `from cf_options` // into internal ones in int_tbl_prop_collector_factories. Add a system internal // one too. extern void GetIntTblPropCollectorFactory( const ImmutableCFOptions& ioptions, - std::vector>* - int_tbl_prop_collector_factories); + IntTblPropCollectorFactories* int_tbl_prop_collector_factories); class ColumnFamilySet; @@ -278,21 +278,11 @@ class ColumnFamilyData { // holding a DB mutex, or as the leader in a write batch group). void Ref() { refs_.fetch_add(1); } - // Unref decreases the reference count, but does not handle deletion - // when the count goes to 0. If this method returns true then the - // caller should delete the instance immediately, or later, by calling - // FreeDeadColumnFamilies(). Unref() can only be called while holding - // a DB mutex, or during single-threaded recovery. - bool Unref() { - int old_refs = refs_.fetch_sub(1); - assert(old_refs > 0); - return old_refs == 1; - } - // UnrefAndTryDelete() decreases the reference count and do free if needed, // return true if this is freed else false, UnrefAndTryDelete() can only // be called while holding a DB mutex, or during single-threaded recovery. - bool UnrefAndTryDelete(); + // sv_under_cleanup is only provided when called from SuperVersion::Cleanup. + bool UnrefAndTryDelete(SuperVersion* sv_under_cleanup = nullptr); // SetDropped() can only be called under following conditions: // 1) Holding a DB mutex, @@ -325,7 +315,7 @@ class ColumnFamilyData { FlushReason GetFlushReason() const { return flush_reason_; } // thread-safe const FileOptions* soptions() const; - const ImmutableCFOptions* ioptions() const { return &ioptions_; } + const ImmutableOptions* ioptions() const { return &ioptions_; } // REQUIRES: DB mutex held // This returns the MutableCFOptions used by current SuperVersion // You should use this API to reference MutableCFOptions most of the time. 
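The hunk above removes ColumnFamilyData::Unref(), which left deletion to the caller, in favor of UnrefAndTryDelete(), which frees the object itself once the reference count reaches zero. A sketch of the caller-side change, not part of the patch; the same pattern appears in SuperVersion::Cleanup() in db/column_family.cc earlier in this patch:

  // old pattern, removed by this patch: the caller owns the delete
  if (cfd->Unref()) {
    delete cfd;
  }

  // new pattern: deletion is handled internally; the optional SuperVersion*
  // argument lets Cleanup() avoid re-entering itself on the SuperVersion
  // currently being torn down
  cfd->UnrefAndTryDelete();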
@@ -359,6 +349,11 @@ class ColumnFamilyData { MemTableList* imm() { return &imm_; } MemTable* mem() { return mem_; } + + bool IsEmpty() { + return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0; + } + Version* current() { return current_; } Version* dummy_versions() { return dummy_versions_; } void SetCurrent(Version* _current); @@ -381,6 +376,7 @@ class ColumnFamilyData { SequenceNumber earliest_seq); TableCache* table_cache() const { return table_cache_.get(); } + BlobFileCache* blob_file_cache() const { return blob_file_cache_.get(); } // See documentation in compaction_picker.h // REQUIRES: DB mutex held @@ -404,7 +400,8 @@ class ColumnFamilyData { // // Thread-safe Status RangesOverlapWithMemtables(const autovector& ranges, - SuperVersion* super_version, bool* overlap); + SuperVersion* super_version, + bool allow_data_in_errors, bool* overlap); // A flag to tell a manual compaction is to compact all levels together // instead of a specific level. @@ -430,8 +427,7 @@ class ColumnFamilyData { return internal_comparator_; } - const std::vector>* - int_tbl_prop_collector_factories() const { + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories() const { return &int_tbl_prop_collector_factories_; } @@ -443,7 +439,7 @@ class ColumnFamilyData { // Get SuperVersion stored in thread local storage. If it does not exist, // get a reference from a current SuperVersion. SuperVersion* GetThreadLocalSuperVersion(DBImpl* db); - // Try to return SuperVersion back to thread local storage. Retrun true on + // Try to return SuperVersion back to thread local storage. Return true on // success and false on failure. It fails when the thread local storage // contains anything other than SuperVersion::kSVInUse flag. bool ReturnThreadLocalSuperVersion(SuperVersion* sv); @@ -477,9 +473,11 @@ class ColumnFamilyData { kPendingCompactionBytes, }; static std::pair - GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files, - uint64_t num_compaction_needed_bytes, - const MutableCFOptions& mutable_cf_options); + GetWriteStallConditionAndCause( + int num_unflushed_memtables, int num_l0_files, + uint64_t num_compaction_needed_bytes, + const MutableCFOptions& mutable_cf_options, + const ImmutableCFOptions& immutable_cf_options); // Recalculate some small conditions, which are changed only during // compaction, adding new memtable and/or @@ -506,6 +504,21 @@ class ColumnFamilyData { FSDirectory* GetDataDir(size_t path_id) const; + // full_history_ts_low_ can only increase. 
+ void SetFullHistoryTsLow(std::string ts_low) { + assert(!ts_low.empty()); + const Comparator* ucmp = user_comparator(); + assert(ucmp); + if (full_history_ts_low_.empty() || + ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) { + full_history_ts_low_ = std::move(ts_low); + } + } + + const std::string& GetFullHistoryTsLow() const { + return full_history_ts_low_; + } + ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } private: @@ -519,7 +532,8 @@ class ColumnFamilyData { const FileOptions& file_options, ColumnFamilySet* column_family_set, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer); + const std::shared_ptr& io_tracer, + const std::string& db_session_id); std::vector GetDbPaths() const; @@ -533,16 +547,16 @@ class ColumnFamilyData { std::atomic dropped_; // true if client dropped it const InternalKeyComparator internal_comparator_; - std::vector> - int_tbl_prop_collector_factories_; + IntTblPropCollectorFactories int_tbl_prop_collector_factories_; const ColumnFamilyOptions initial_cf_options_; - const ImmutableCFOptions ioptions_; + const ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; const bool is_delete_range_supported_; std::unique_ptr table_cache_; + std::unique_ptr blob_file_cache_; std::unique_ptr internal_stats_; @@ -601,6 +615,8 @@ class ColumnFamilyData { std::vector> data_dirs_; bool db_paths_registered_; + + std::string full_history_ts_low_; }; // ColumnFamilySet has interesting thread-safety requirements @@ -653,7 +669,8 @@ class ColumnFamilySet { WriteBufferManager* _write_buffer_manager, WriteController* _write_controller, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer); + const std::shared_ptr& io_tracer, + const std::string& db_session_id); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -718,6 +735,7 @@ class ColumnFamilySet { WriteController* write_controller_; BlockCacheTracer* const block_cache_tracer_; std::shared_ptr io_tracer_; + std::string db_session_id_; }; // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access diff --git a/db/column_family_test.cc b/db/column_family_test.cc index fcb71926802..2db49813abe 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -35,10 +35,10 @@ namespace ROCKSDB_NAMESPACE { static const int kValueSize = 1000; // counts how many operations were performed -class EnvCounter : public EnvWrapper { +class EnvCounter : public SpecialEnv { public: explicit EnvCounter(Env* base) - : EnvWrapper(base), num_new_writable_file_(0) {} + : SpecialEnv(base), num_new_writable_file_(0) {} int GetNumberOfNewWritableFileCalls() { return num_new_writable_file_; } @@ -56,23 +56,16 @@ class ColumnFamilyTestBase : public testing::Test { public: explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) { Env* base_env = Env::Default(); -#ifndef ROCKSDB_LITE - const char* test_env_uri = getenv("TEST_ENV_URI"); - if (test_env_uri) { - Env* test_env = nullptr; - Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_); - base_env = test_env; - EXPECT_OK(s); - EXPECT_NE(Env::Default(), base_env); - } -#endif // !ROCKSDB_LITE + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); EXPECT_NE(nullptr, base_env); env_ = new EnvCounter(base_env); + env_->skip_fsync_ = true; dbname_ = test::PerThreadDBPath("column_family_test"); db_options_.create_if_missing = true; db_options_.fail_if_options_file_error = true; db_options_.env = env_; - DestroyDB(dbname_, 
Options(db_options_, column_family_options_)); + EXPECT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_))); } ~ColumnFamilyTestBase() override { @@ -87,7 +80,6 @@ class ColumnFamilyTestBase : public testing::Test { #endif // ROCKSDB_LITE column_families.push_back(cfdescriptor); } - Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); Destroy(column_families); delete env_; @@ -187,8 +179,8 @@ class ColumnFamilyTestBase : public testing::Test { std::vector column_families; names_.clear(); for (size_t i = 0; i < cf.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor( - cf[i], options.size() == 0 ? column_family_options_ : options[i])); + column_families.emplace_back( + cf[i], options.size() == 0 ? column_family_options_ : options[i]); names_.push_back(cf[i]); } return DB::Open(db_options_, dbname_, column_families, &handles_, &db_); @@ -199,8 +191,8 @@ class ColumnFamilyTestBase : public testing::Test { std::vector column_families; names_.clear(); for (size_t i = 0; i < cf.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor( - cf[i], options.size() == 0 ? column_family_options_ : options[i])); + column_families.emplace_back( + cf[i], options.size() == 0 ? column_family_options_ : options[i]); names_.push_back(cf[i]); } return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_, @@ -653,8 +645,8 @@ TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest) { // after flushing file B is deleted. At the same time, the min log number of // default CF is not written to manifest. Log file A still remains. // Flushed to SST file Y. - Flush(1); - Flush(0); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(0)); ASSERT_OK(Put(1, "bar", "v3")); // seqID 4 ASSERT_OK(Put(1, "foo", "v4")); // seqID 5 ASSERT_OK(db_->FlushWAL(/*sync=*/false)); @@ -708,15 +700,15 @@ TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest2) { // and is set to current. Both CFs' min log number is set to file C so after // flushing file B is deleted. Log file A still remains. // Flushed to SST file Y. - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(0, "bar", "v2")); // seqID 4 ASSERT_OK(Put(2, "bar", "v2")); // seqID 5 ASSERT_OK(Put(1, "bar", "v3")); // seqID 6 // Flushing all column families. This forces all CFs' min log to current. This // is written to the manifest file. Log file C is cleared. - Flush(0); - Flush(1); - Flush(2); + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(2)); // Write to log file D ASSERT_OK(Put(1, "bar", "v4")); // seqID 7 ASSERT_OK(Put(1, "bar", "v5")); // seqID 8 @@ -898,9 +890,7 @@ TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) { std::vector old_files; ASSERT_OK(env_->GetChildren(backup_logs, &old_files)); for (auto& file : old_files) { - if (file != "." && file != "..") { - ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file)); - } + ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file)); } column_family_options_.merge_operator = @@ -929,9 +919,7 @@ TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) { std::vector logs; ASSERT_OK(env_->GetChildren(db_options_.wal_dir, &logs)); for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log); - } + CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log); } // recover the DB @@ -956,9 +944,7 @@ TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) { if (iter == 0) { // copy the logs from backup back to wal dir for (auto& log : logs) { - if (log != ".." 
&& log != ".") { - CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log); - } + CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log); } } } @@ -985,7 +971,7 @@ TEST_P(ColumnFamilyTest, FlushTest) { for (int i = 0; i < 3; ++i) { uint64_t max_total_in_memory_state = MaxTotalInMemoryState(); - Flush(i); + ASSERT_OK(Flush(i)); AssertMaxTotalInMemoryState(max_total_in_memory_state); } ASSERT_OK(Put(1, "foofoo", "bar")); @@ -1093,7 +1079,7 @@ TEST_P(ColumnFamilyTest, CrashAfterFlush) { ASSERT_OK(batch.Put(handles_[0], Slice("foo"), Slice("bar"))); ASSERT_OK(batch.Put(handles_[1], Slice("foo"), Slice("bar"))); ASSERT_OK(db_->Write(WriteOptions(), &batch)); - Flush(0); + ASSERT_OK(Flush(0)); fault_env->SetFilesystemActive(false); std::vector names; @@ -1103,7 +1089,7 @@ TEST_P(ColumnFamilyTest, CrashAfterFlush) { } } Close(); - fault_env->DropUnsyncedFileData(); + ASSERT_OK(fault_env->DropUnsyncedFileData()); fault_env->ResetState(); Open(names, {}); @@ -2236,7 +2222,7 @@ TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) { // files for column family [one], because it's empty AssertCountLiveFiles(4); - Flush(0); + ASSERT_OK(Flush(0)); ASSERT_EQ(0, dbfull()->TEST_total_log_size()); Close(); } @@ -2292,6 +2278,8 @@ TEST_P(ColumnFamilyTest, SanitizeOptions) { // not a multiple of 4k, round up 4k expected_arena_block_size += 4 * 1024; } + expected_arena_block_size = + std::min(size_t{1024 * 1024}, expected_arena_block_size); ASSERT_EQ(expected_arena_block_size, result.arena_block_size); } } @@ -3040,7 +3028,7 @@ TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) { Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]); ASSERT_OK(it->status()); // A flush will make `it` hold the last reference of its super version. - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(0, "fodor", "mirko")); @@ -3093,7 +3081,7 @@ TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) { Iterator* it = db_->NewIterator(ro, handles_[1]); ASSERT_OK(it->status()); // A flush will make `it` hold the last reference of its super version. - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(0, "fodor", "mirko")); @@ -3147,7 +3135,7 @@ TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) { CreateColumnFamilies({"one"}); ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(1, "fodar2", "mirko")); - Flush(1); + ASSERT_OK(Flush(1)); // Create an iterator holding the current super version, as well as // the SST file just flushed. 
@@ -3159,7 +3147,7 @@ TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) { ASSERT_OK(Put(1, "fodor", "mirko")); ASSERT_OK(Put(1, "fodar2", "mirko")); - Flush(1); + ASSERT_OK(Flush(1)); WaitForCompaction(); @@ -3232,9 +3220,9 @@ TEST_P(ColumnFamilyTest, LogSyncConflictFlush) { ROCKSDB_NAMESPACE::port::Thread thread([&] { ASSERT_OK(db_->SyncWAL()); }); TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1"); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2"); @@ -3256,7 +3244,7 @@ TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) { Build(0, 100); // Flush the 0th column family to force a roll of the wal log - Flush(0); + ASSERT_OK(Flush(0)); // Add some more entries Build(100, 100); @@ -3271,7 +3259,7 @@ TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) { FileType type; if (!(ParseFileName(filenames[i], &number, &type))) continue; - if (type != kLogFile) continue; + if (type != kWalFile) continue; logfs.push_back(filenames[i]); } @@ -3332,14 +3320,14 @@ TEST_P(ColumnFamilyTest, DefaultCfPathsTest) { // Fill Column family 1. PutRandomData(1, 100, 100); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(dbname_)); // Fill column family 2 PutRandomData(2, 100, 100); - Flush(2); + ASSERT_OK(Flush(2)); // SST from Column family 2 should be generated in // db_paths which is dbname_ in this case. @@ -3358,14 +3346,14 @@ TEST_P(ColumnFamilyTest, MultipleCFPathsTest) { Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); PutRandomData(1, 100, 100, true /* save */); - Flush(1); + ASSERT_OK(Flush(1)); // Check that files are generated in appropriate paths. ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(dbname_)); PutRandomData(2, 100, 100, true /* save */); - Flush(2); + ASSERT_OK(Flush(2)); ASSERT_EQ(1, GetSstFileCount(cf_opt2.cf_paths[0].path)); ASSERT_EQ(0, GetSstFileCount(dbname_)); @@ -3391,6 +3379,30 @@ TEST_P(ColumnFamilyTest, MultipleCFPathsTest) { } } +TEST(ColumnFamilyTest, ValidateBlobGCCutoff) { + DBOptions db_options; + + ColumnFamilyOptions cf_options; + cf_options.enable_blob_garbage_collection = true; + + cf_options.blob_garbage_collection_age_cutoff = -0.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); + + cf_options.blob_garbage_collection_age_cutoff = 0.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 0.5; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 1.0; + ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); + + cf_options.blob_garbage_collection_age_cutoff = 1.5; + ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options) + .IsInvalidArgument()); +} + } // namespace ROCKSDB_NAMESPACE #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index 048ed6e26f1..4793adddf7c 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -91,9 +91,9 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) { // create couple files // Background compaction starts and waits in BackgroundCallCompaction:0 for (int i = 0; i < kLevel0Trigger * 4; ++i) { - db->Put(WriteOptions(), ToString(i), ""); - db->Put(WriteOptions(), ToString(100 - i), ""); - 
db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), "")); + ASSERT_OK(db->Put(WriteOptions(), ToString(100 - i), "")); + ASSERT_OK(db->Flush(FlushOptions())); } ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta; @@ -118,6 +118,78 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) { delete db; } +TEST_F(CompactFilesTest, MultipleLevel) { + Options options; + options.create_if_missing = true; + options.level_compaction_dynamic_level_bytes = true; + options.num_levels = 6; + // Add listener + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + DB* db = nullptr; + DestroyDB(db_name_, options); + Status s = DB::Open(options, db_name_, &db); + ASSERT_OK(s); + ASSERT_NE(db, nullptr); + + // create couple files in L0, L3, L4 and L5 + for (int i = 5; i > 2; --i) { + collector->ClearFlushedFiles(); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), "")); + ASSERT_OK(db->Flush(FlushOptions())); + auto l0_files = collector->GetFlushedFiles(); + ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, i)); + + std::string prop; + ASSERT_TRUE( + db->GetProperty("rocksdb.num-files-at-level" + ToString(i), &prop)); + ASSERT_EQ("1", prop); + } + ASSERT_OK(db->Put(WriteOptions(), ToString(0), "")); + ASSERT_OK(db->Flush(FlushOptions())); + + ColumnFamilyMetaData meta; + db->GetColumnFamilyMetaData(&meta); + // Compact files except the file in L3 + std::vector files; + for (int i = 0; i < 6; ++i) { + if (i == 3) continue; + for (auto& file : meta.levels[i].files) { + files.push_back(file.db_path + "/" + file.name); + } + } + + SyncPoint::GetInstance()->LoadDependency({ + {"CompactionJob::Run():Start", "CompactFilesTest.MultipleLevel:0"}, + {"CompactFilesTest.MultipleLevel:1", "CompactFilesImpl:3"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::thread thread([&] { + TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:0"); + ASSERT_OK(db->Put(WriteOptions(), "bar", "v2")); + ASSERT_OK(db->Put(WriteOptions(), "foo", "v2")); + ASSERT_OK(db->Flush(FlushOptions())); + TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:1"); + }); + + // Compaction cannot move up the data to higher level + // here we have input file from level 5, so the output level has to be >= 5 + for (int invalid_output_level = 0; invalid_output_level < 5; + invalid_output_level++) { + s = db->CompactFiles(CompactionOptions(), files, invalid_output_level); + std::cout << s.ToString() << std::endl; + ASSERT_TRUE(s.IsInvalidArgument()); + } + + ASSERT_OK(db->CompactFiles(CompactionOptions(), files, 5)); + SyncPoint::GetInstance()->DisableProcessing(); + thread.join(); + + delete db; +} + TEST_F(CompactFilesTest, ObsoleteFiles) { Options options; // to trigger compaction more easily @@ -138,18 +210,18 @@ TEST_F(CompactFilesTest, ObsoleteFiles) { DB* db = nullptr; DestroyDB(db_name_, options); Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); - assert(db); + ASSERT_OK(s); + ASSERT_NE(db, nullptr); // create couple files for (int i = 1000; i < 2000; ++i) { - db->Put(WriteOptions(), ToString(i), - std::string(kWriteBufferSize / 10, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(kWriteBufferSize / 10, 'a' + (i % 26)))); } auto l0_files = collector->GetFlushedFiles(); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); - static_cast_with_check(db)->TEST_WaitForCompact(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForCompact()); // verify all compaction input files are deleted for (auto fname : l0_files) { @@ 
-182,15 +254,17 @@ TEST_F(CompactFilesTest, NotCutOutputOnLevel0) { // create couple files for (int i = 0; i < 500; ++i) { - db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(1000, 'a' + (i % 26)))); } - static_cast_with_check(db)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); auto l0_files_1 = collector->GetFlushedFiles(); collector->ClearFlushedFiles(); for (int i = 0; i < 500; ++i) { - db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(1000, 'a' + (i % 26)))); } - static_cast_with_check(db)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); auto l0_files_2 = collector->GetFlushedFiles(); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0)); ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0)); @@ -213,13 +287,13 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) { DB* db = nullptr; DestroyDB(db_name_, options); Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); + ASSERT_OK(s); assert(db); // Create 5 files. for (int i = 0; i < 5; ++i) { - db->Put(WriteOptions(), "key" + ToString(i), "value"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "key" + ToString(i), "value")); + ASSERT_OK(db->Flush(FlushOptions())); } auto l0_files = collector->GetFlushedFiles(); @@ -237,8 +311,8 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) { // In the meantime flush another file. TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0"); - db->Put(WriteOptions(), "key5", "value"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "key5", "value")); + ASSERT_OK(db->Flush(FlushOptions())); TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1"); compaction_thread.join(); @@ -249,7 +323,7 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) { // Make sure we can reopen the DB. 
s = DB::Open(options, db_name_, &db); - ASSERT_TRUE(s.ok()); + ASSERT_OK(s); assert(db); delete db; } @@ -293,8 +367,8 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { cf->SetDB(db); // Write one L0 file - db->Put(WriteOptions(), "K1", "V1"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "K1", "V1")); + ASSERT_OK(db->Flush(FlushOptions())); // Compact all L0 files using CompactFiles ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta; @@ -337,8 +411,8 @@ TEST_F(CompactFilesTest, SentinelCompressionType) { DB* db = nullptr; ASSERT_OK(DB::Open(options, db_name_, &db)); - db->Put(WriteOptions(), "key", "val"); - db->Flush(FlushOptions()); + ASSERT_OK(db->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db->Flush(FlushOptions())); auto l0_files = collector->GetFlushedFiles(); ASSERT_EQ(1, l0_files.size()); @@ -377,14 +451,15 @@ TEST_F(CompactFilesTest, GetCompactionJobInfo) { DB* db = nullptr; DestroyDB(db_name_, options); Status s = DB::Open(options, db_name_, &db); - assert(s.ok()); + ASSERT_OK(s); assert(db); // create couple files for (int i = 0; i < 500; ++i) { - db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26))); + ASSERT_OK(db->Put(WriteOptions(), ToString(i), + std::string(1000, 'a' + (i % 26)))); } - static_cast_with_check(db)->TEST_WaitForFlushMemTable(); + ASSERT_OK(static_cast_with_check(db)->TEST_WaitForFlushMemTable()); auto l0_files_1 = collector->GetFlushedFiles(); CompactionOptions co; co.compression = CompressionType::kLZ4Compression; diff --git a/db/compaction/clipping_iterator.h b/db/compaction/clipping_iterator.h new file mode 100644 index 00000000000..b287b653e10 --- /dev/null +++ b/db/compaction/clipping_iterator.h @@ -0,0 +1,275 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +// An internal iterator that wraps another one and ensures that any keys +// returned are strictly within a range [start, end). If the underlying +// iterator has already performed the bounds checking, it relies on that result; +// otherwise, it performs the necessary key comparisons itself. Both bounds +// are optional. 
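+// A minimal usage sketch (assuming the test::VectorIterator helper and
+// BytewiseComparator(), as used by the unit test below): clip an iterator
+// over "key0".."key3" to the range ["key1", "key3").
+//
+//   std::vector<std::string> keys{"key0", "key1", "key2", "key3"};
+//   std::vector<std::string> values{"v0", "v1", "v2", "v3"};
+//   test::VectorIterator input(keys, values);
+//   Slice start("key1");
+//   Slice end("key3");
+//   ClippingIterator clip(&input, &start, &end, BytewiseComparator());
+//   for (clip.SeekToFirst(); clip.Valid(); clip.Next()) {
+//     // visits only ("key1", "v1") and ("key2", "v2"); "key0" falls below
+//     // the lower bound, and "key3" is excluded since the upper bound is
+//     // exclusive.
+//   }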
+class ClippingIterator : public InternalIterator { + public: + ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end, + const Comparator* cmp) + : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) { + assert(iter_); + assert(cmp_); + assert(!start_ || !end_ || cmp_->Compare(*start_, *end_) <= 0); + + UpdateAndEnforceBounds(); + } + + bool Valid() const override { return valid_; } + + void SeekToFirst() override { + if (start_) { + iter_->Seek(*start_); + } else { + iter_->SeekToFirst(); + } + + UpdateAndEnforceUpperBound(); + } + + void SeekToLast() override { + if (end_) { + iter_->SeekForPrev(*end_); + + // Upper bound is exclusive, so we need a key which is strictly smaller + if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) { + iter_->Prev(); + } + } else { + iter_->SeekToLast(); + } + + UpdateAndEnforceLowerBound(); + } + + void Seek(const Slice& target) override { + if (start_ && cmp_->Compare(target, *start_) < 0) { + iter_->Seek(*start_); + UpdateAndEnforceUpperBound(); + return; + } + + if (end_ && cmp_->Compare(target, *end_) >= 0) { + valid_ = false; + return; + } + + iter_->Seek(target); + UpdateAndEnforceUpperBound(); + } + + void SeekForPrev(const Slice& target) override { + if (start_ && cmp_->Compare(target, *start_) < 0) { + valid_ = false; + return; + } + + if (end_ && cmp_->Compare(target, *end_) >= 0) { + iter_->SeekForPrev(*end_); + + // Upper bound is exclusive, so we need a key which is strictly smaller + if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) { + iter_->Prev(); + } + + UpdateAndEnforceLowerBound(); + return; + } + + iter_->SeekForPrev(target); + UpdateAndEnforceLowerBound(); + } + + void Next() override { + assert(valid_); + iter_->Next(); + UpdateAndEnforceUpperBound(); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(valid_); + assert(result); + + IterateResult res; + valid_ = iter_->NextAndGetResult(&res); + + if (!valid_) { + return false; + } + + if (end_) { + EnforceUpperBoundImpl(res.bound_check_result); + + if (!valid_) { + return false; + } + } + + res.bound_check_result = IterBoundCheck::kInbound; + *result = res; + + return true; + } + + void Prev() override { + assert(valid_); + iter_->Prev(); + UpdateAndEnforceLowerBound(); + } + + Slice key() const override { + assert(valid_); + return iter_->key(); + } + + Slice user_key() const override { + assert(valid_); + return iter_->user_key(); + } + + Slice value() const override { + assert(valid_); + return iter_->value(); + } + + Status status() const override { return iter_->status(); } + + bool PrepareValue() override { + assert(valid_); + + if (iter_->PrepareValue()) { + return true; + } + + assert(!iter_->Valid()); + valid_ = false; + return false; + } + + bool MayBeOutOfLowerBound() override { + assert(valid_); + return false; + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(valid_); + return IterBoundCheck::kInbound; + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + + bool IsKeyPinned() const override { + assert(valid_); + return iter_->IsKeyPinned(); + } + + bool IsValuePinned() const override { + assert(valid_); + return iter_->IsValuePinned(); + } + + Status GetProperty(std::string prop_name, std::string* prop) override { + return iter_->GetProperty(prop_name, prop); + } + + private: + void UpdateValid() { + assert(!iter_->Valid() || iter_->status().ok()); + + valid_ = iter_->Valid(); + } + + void 
EnforceUpperBoundImpl(IterBoundCheck bound_check_result) { + if (bound_check_result == IterBoundCheck::kInbound) { + return; + } + + if (bound_check_result == IterBoundCheck::kOutOfBound) { + valid_ = false; + return; + } + + assert(bound_check_result == IterBoundCheck::kUnknown); + + if (cmp_->Compare(key(), *end_) >= 0) { + valid_ = false; + } + } + + void EnforceUpperBound() { + if (!valid_) { + return; + } + + if (!end_) { + return; + } + + EnforceUpperBoundImpl(iter_->UpperBoundCheckResult()); + } + + void EnforceLowerBound() { + if (!valid_) { + return; + } + + if (!start_) { + return; + } + + if (!iter_->MayBeOutOfLowerBound()) { + return; + } + + if (cmp_->Compare(key(), *start_) < 0) { + valid_ = false; + } + } + + void AssertBounds() { + assert(!valid_ || !start_ || cmp_->Compare(key(), *start_) >= 0); + assert(!valid_ || !end_ || cmp_->Compare(key(), *end_) < 0); + } + + void UpdateAndEnforceBounds() { + UpdateValid(); + EnforceUpperBound(); + EnforceLowerBound(); + AssertBounds(); + } + + void UpdateAndEnforceUpperBound() { + UpdateValid(); + EnforceUpperBound(); + AssertBounds(); + } + + void UpdateAndEnforceLowerBound() { + UpdateValid(); + EnforceLowerBound(); + AssertBounds(); + } + + InternalIterator* iter_; + const Slice* start_; + const Slice* end_; + const Comparator* cmp_; + bool valid_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/clipping_iterator_test.cc b/db/compaction/clipping_iterator_test.cc new file mode 100644 index 00000000000..3a31b61eb49 --- /dev/null +++ b/db/compaction/clipping_iterator_test.cc @@ -0,0 +1,256 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/compaction/clipping_iterator.h" + +#include +#include +#include +#include + +#include "rocksdb/comparator.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +// A vector iterator which does its own bounds checking. This is for testing the +// optimizations in the clipping iterator where we bypass the bounds checking if +// the input iterator has already performed it. +class BoundsCheckingVectorIterator : public test::VectorIterator { + public: + BoundsCheckingVectorIterator(const std::vector& keys, + const std::vector& values, + const Slice* start, const Slice* end, + const Comparator* cmp) + : VectorIterator(keys, values), start_(start), end_(end), cmp_(cmp) { + assert(cmp_); + } + + bool NextAndGetResult(IterateResult* result) override { + assert(Valid()); + assert(result); + + Next(); + + if (!Valid()) { + return false; + } + + result->key = key(); + result->bound_check_result = UpperBoundCheckResult(); + result->value_prepared = true; + + return true; + } + + bool MayBeOutOfLowerBound() override { + assert(Valid()); + + if (!start_) { + return false; + } + + return cmp_->Compare(key(), *start_) < 0; + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(Valid()); + + if (!end_) { + return IterBoundCheck::kInbound; + } + + return cmp_->Compare(key(), *end_) >= 0 ? 
IterBoundCheck::kOutOfBound + : IterBoundCheck::kInbound; + } + + private: + const Slice* start_; + const Slice* end_; + const Comparator* cmp_; +}; + +class ClippingIteratorTest + : public ::testing::Test, + public ::testing::WithParamInterface> {}; + +TEST_P(ClippingIteratorTest, Clip) { + const std::vector keys{"key0", "key1", "key2", "key3", "key4", + "key5", "key6", "key7", "key8", "key9"}; + const std::vector values{ + "unused0", "value1", "value2", "value3", "unused4", + "unused5", "unused6", "unused7", "unused8", "unused9"}; + + assert(keys.size() == values.size()); + + // Note: the input always contains key1, key2, and key3; however, the clipping + // window is based on the test parameters: its left edge is a value in the + // range [0, 4], and its size is a value in the range [0, 5] + const std::vector input_keys{keys[1], keys[2], keys[3]}; + const std::vector input_values{values[1], values[2], values[3]}; + + const bool use_bounds_checking_vec_it = std::get<0>(GetParam()); + + const size_t clip_start_idx = std::get<1>(GetParam()); + const size_t clip_window_size = std::get<2>(GetParam()); + const size_t clip_end_idx = clip_start_idx + clip_window_size; + + const Slice start(keys[clip_start_idx]); + const Slice end(keys[clip_end_idx]); + + std::unique_ptr input( + use_bounds_checking_vec_it + ? new BoundsCheckingVectorIterator(input_keys, input_values, &start, + &end, BytewiseComparator()) + : new test::VectorIterator(input_keys, input_values)); + + ClippingIterator clip(input.get(), &start, &end, BytewiseComparator()); + + // The range the clipping iterator should return values from. This is + // essentially the intersection of the input range [1, 4) and the clipping + // window [clip_start_idx, clip_end_idx) + const size_t data_start_idx = + std::max(clip_start_idx, static_cast(1)); + const size_t data_end_idx = std::min(clip_end_idx, static_cast(4)); + + // Range is empty; all Seeks should fail + if (data_start_idx >= data_end_idx) { + clip.SeekToFirst(); + ASSERT_FALSE(clip.Valid()); + + clip.SeekToLast(); + ASSERT_FALSE(clip.Valid()); + + for (size_t i = 0; i < keys.size(); ++i) { + clip.Seek(keys[i]); + ASSERT_FALSE(clip.Valid()); + + clip.SeekForPrev(keys[i]); + ASSERT_FALSE(clip.Valid()); + } + + return; + } + + // Range is non-empty; call SeekToFirst and iterate forward + clip.SeekToFirst(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) { + clip.Next(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + clip.Next(); + ASSERT_FALSE(clip.Valid()); + + // Do it again using NextAndGetResult + clip.SeekToFirst(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) { + IterateResult result; + ASSERT_TRUE(clip.NextAndGetResult(&result)); + ASSERT_EQ(result.key, keys[i]); + ASSERT_EQ(result.bound_check_result, IterBoundCheck::kInbound); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + 
ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + IterateResult result; + ASSERT_FALSE(clip.NextAndGetResult(&result)); + ASSERT_FALSE(clip.Valid()); + + // Call SeekToLast and iterate backward + clip.SeekToLast(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_end_idx - 1]); + ASSERT_EQ(clip.value(), values[data_end_idx - 1]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + + for (size_t i = data_end_idx - 2; i >= data_start_idx; --i) { + clip.Prev(); + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + + clip.Prev(); + ASSERT_FALSE(clip.Valid()); + + // Call Seek/SeekForPrev for all keys; Seek should return the smallest key + // which is >= the target; SeekForPrev should return the largest key which is + // <= the target + for (size_t i = 0; i < keys.size(); ++i) { + clip.Seek(keys[i]); + + if (i < data_start_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_start_idx]); + ASSERT_EQ(clip.value(), values[data_start_idx]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else if (i < data_end_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else { + ASSERT_FALSE(clip.Valid()); + } + + clip.SeekForPrev(keys[i]); + + if (i < data_start_idx) { + ASSERT_FALSE(clip.Valid()); + } else if (i < data_end_idx) { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[i]); + ASSERT_EQ(clip.value(), values[i]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } else { + ASSERT_TRUE(clip.Valid()); + ASSERT_EQ(clip.key(), keys[data_end_idx - 1]); + ASSERT_EQ(clip.value(), values[data_end_idx - 1]); + ASSERT_FALSE(clip.MayBeOutOfLowerBound()); + ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound); + } + } +} + +INSTANTIATE_TEST_CASE_P( + ClippingIteratorTest, ClippingIteratorTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Range(static_cast(0), static_cast(5)), + ::testing::Range(static_cast(0), static_cast(6)))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index 60e2681faa1..57f814fbc75 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -204,27 +204,24 @@ bool Compaction::IsFullCompaction( return num_files_in_compaction == total_num_files; } -Compaction::Compaction(VersionStorageInfo* vstorage, - const ImmutableCFOptions& _immutable_cf_options, - const MutableCFOptions& _mutable_cf_options, - const MutableDBOptions& _mutable_db_options, - std::vector _inputs, - int _output_level, uint64_t _target_file_size, - uint64_t _max_compaction_bytes, uint32_t _output_path_id, - CompressionType _compression, - CompressionOptions _compression_opts, - uint32_t _max_subcompactions, - std::vector _grandparents, - bool _manual_compaction, double _score, - bool _deletion_compaction, - CompactionReason 
_compaction_reason) +Compaction::Compaction( + VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options, + const MutableCFOptions& _mutable_cf_options, + const MutableDBOptions& _mutable_db_options, + std::vector _inputs, int _output_level, + uint64_t _target_file_size, uint64_t _max_compaction_bytes, + uint32_t _output_path_id, CompressionType _compression, + CompressionOptions _compression_opts, uint32_t _max_subcompactions, + std::vector _grandparents, bool _manual_compaction, + double _score, bool _deletion_compaction, + CompactionReason _compaction_reason) : input_vstorage_(vstorage), start_level_(_inputs[0].level), output_level_(_output_level), max_output_file_size_(_target_file_size), max_compaction_bytes_(_max_compaction_bytes), max_subcompactions_(_max_subcompactions), - immutable_cf_options_(_immutable_cf_options), + immutable_options_(_immutable_options), mutable_cf_options_(_mutable_cf_options), input_version_(nullptr), number_levels_(vstorage->num_levels()), @@ -248,12 +245,6 @@ Compaction::Compaction(VersionStorageInfo* vstorage, if (max_subcompactions_ == 0) { max_subcompactions_ = _mutable_db_options.max_subcompactions; } - if (!bottommost_level_) { - // Currently we only enable dictionary compression during compaction to the - // bottommost level. - output_compression_opts_.max_dict_bytes = 0; - output_compression_opts_.zstd_max_train_bytes = 0; - } #ifndef NDEBUG for (size_t i = 1; i < inputs_.size(); ++i) { @@ -284,7 +275,7 @@ Compaction::~Compaction() { bool Compaction::InputCompressionMatchesOutput() const { int base_level = input_vstorage_->base_level(); - bool matches = (GetCompressionType(immutable_cf_options_, input_vstorage_, + bool matches = (GetCompressionType(immutable_options_, input_vstorage_, mutable_cf_options_, start_level_, base_level) == output_compression_); if (matches) { @@ -309,8 +300,8 @@ bool Compaction::IsTrivialMove() const { } if (is_manual_compaction_ && - (immutable_cf_options_.compaction_filter != nullptr || - immutable_cf_options_.compaction_filter_factory != nullptr)) { + (immutable_options_.compaction_filter != nullptr || + immutable_options_.compaction_filter_factory != nullptr)) { // This is a manual compaction and we have a compaction filter that should // be executed, we cannot do a trivial move return false; @@ -383,7 +374,13 @@ bool Compaction::KeyNotExistsBeyondOutputLevel( auto* f = files[level_ptrs->at(lvl)]; if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // In the presence of user-defined timestamp, we may need to handle + // the case in which f->smallest.user_key() (including ts) has the + // same user key, but the ts part is smaller. If so, + // Compare(user_key, f->smallest.user_key()) returns -1. + // That's why we need CompareWithoutTimestamp(). + if (user_cmp->CompareWithoutTimestamp(user_key, + f->smallest.user_key()) >= 0) { // Key falls in this file's range, so it may // exist beyond output level return false; @@ -512,14 +509,14 @@ uint64_t Compaction::OutputFilePreallocationSize() const { } if (max_output_file_size_ != port::kMaxUint64 && - (immutable_cf_options_.compaction_style == kCompactionStyleLevel || + (immutable_options_.compaction_style == kCompactionStyleLevel || output_level() > 0)) { preallocation_size = std::min(max_output_file_size_, preallocation_size); } // Over-estimate slightly so we don't end up just barely crossing // the threshold - // No point to prellocate more than 1GB. 
+ // No point to preallocate more than 1GB. return std::min(uint64_t{1073741824}, preallocation_size + (preallocation_size / 10)); } @@ -529,16 +526,23 @@ std::unique_ptr Compaction::CreateCompactionFilter() const { return nullptr; } + if (!cfd_->ioptions() + ->compaction_filter_factory->ShouldFilterTableFileCreation( + TableFileCreationReason::kCompaction)) { + return nullptr; + } + CompactionFilter::Context context; context.is_full_compaction = is_full_compaction_; context.is_manual_compaction = is_manual_compaction_; context.column_family_id = cfd_->GetID(); + context.reason = TableFileCreationReason::kCompaction; return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter( context); } std::unique_ptr Compaction::CreateSstPartitioner() const { - if (!immutable_cf_options_.sst_partitioner_factory) { + if (!immutable_options_.sst_partitioner_factory) { return nullptr; } @@ -548,8 +552,7 @@ std::unique_ptr Compaction::CreateSstPartitioner() const { context.output_level = output_level_; context.smallest_user_key = smallest_user_key_; context.largest_user_key = largest_user_key_; - return immutable_cf_options_.sst_partitioner_factory->CreatePartitioner( - context); + return immutable_options_.sst_partitioner_factory->CreatePartitioner(context); } bool Compaction::IsOutputLevelEmpty() const { @@ -560,6 +563,14 @@ bool Compaction::ShouldFormSubcompactions() const { if (max_subcompactions_ <= 1 || cfd_ == nullptr) { return false; } + + // Note: the subcompaction boundary picking logic does not currently guarantee + // that all user keys that differ only by timestamp get processed by the same + // subcompaction. + if (cfd_->user_comparator()->timestamp_size() > 0) { + return false; + } + if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0 && !IsOutputLevelEmpty(); diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index d25ffd603a5..eda9bf002f4 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -70,7 +70,7 @@ class CompactionFilter; class Compaction { public: Compaction(VersionStorageInfo* input_version, - const ImmutableCFOptions& immutable_cf_options, + const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, std::vector inputs, int output_level, @@ -162,7 +162,7 @@ class Compaction { CompressionType output_compression() const { return output_compression_; } // What compression options for output - CompressionOptions output_compression_opts() const { + const CompressionOptions& output_compression_opts() const { return output_compression_opts_; } @@ -223,10 +223,10 @@ class Compaction { // How many total levels are there? 
int number_levels() const { return number_levels_; } - // Return the ImmutableCFOptions that should be used throughout the compaction + // Return the ImmutableOptions that should be used throughout the compaction // procedure - const ImmutableCFOptions* immutable_cf_options() const { - return &immutable_cf_options_; + const ImmutableOptions* immutable_options() const { + return &immutable_options_; } // Return the MutableCFOptions that should be used throughout the compaction @@ -330,7 +330,7 @@ class Compaction { uint64_t max_output_file_size_; uint64_t max_compaction_bytes_; uint32_t max_subcompactions_; - const ImmutableCFOptions immutable_cf_options_; + const ImmutableOptions immutable_options_; const MutableCFOptions mutable_cf_options_; Version* input_version_; VersionEdit edit_; @@ -341,7 +341,7 @@ class Compaction { const uint32_t output_path_id_; CompressionType output_compression_; CompressionOptions output_compression_opts_; - // If true, then the comaction can be done by simply deleting input files. + // If true, then the compaction can be done by simply deleting input files. const bool deletion_compaction_; // Compaction input files organized by level. Constant after construction diff --git a/db/compaction/compaction_iteration_stats.h b/db/compaction/compaction_iteration_stats.h index 963c1d8eb49..cb7b82c65ab 100644 --- a/db/compaction/compaction_iteration_stats.h +++ b/db/compaction/compaction_iteration_stats.h @@ -34,4 +34,8 @@ struct CompactionIterationStats { // Single-Delete diagnostics for exceptional situations uint64_t num_single_del_fallthru = 0; uint64_t num_single_del_mismatch = 0; + + // Blob related statistics + uint64_t num_blobs_read = 0; + uint64_t total_blob_bytes_read = 0; }; diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index 4555ec56832..e48818fd086 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -5,9 +5,11 @@ #include "db/compaction/compaction_iterator.h" -#include +#include +#include #include "db/blob/blob_file_builder.h" +#include "db/blob/blob_index.h" #include "db/snapshot_checker.h" #include "port/likely.h" #include "rocksdb/listener.h" @@ -31,7 +33,6 @@ (snapshot_checker_ == nullptr || LIKELY(IsInEarliestSnapshot(seq)))) namespace ROCKSDB_NAMESPACE { - CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, std::vector* snapshots, @@ -44,16 +45,19 @@ CompactionIterator::CompactionIterator( const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, const std::atomic* manual_compaction_paused, - const std::shared_ptr info_log) + const std::atomic* manual_compaction_canceled, + const std::shared_ptr info_log, + const std::string* full_history_ts_low) : CompactionIterator( input, cmp, merge_helper, last_sequence, snapshots, earliest_write_conflict_snapshot, snapshot_checker, env, report_detailed_time, expect_valid_internal_key, range_del_agg, blob_file_builder, allow_data_in_errors, std::unique_ptr( - compaction ? new CompactionProxy(compaction) : nullptr), + compaction ? 
new RealCompaction(compaction) : nullptr), compaction_filter, shutting_down, preserve_deletes_seqnum, - manual_compaction_paused, info_log) {} + manual_compaction_paused, manual_compaction_canceled, info_log, + full_history_ts_low) {} CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, @@ -68,14 +72,20 @@ CompactionIterator::CompactionIterator( const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, const std::atomic* manual_compaction_paused, - const std::shared_ptr info_log) - : input_(input), + const std::atomic* manual_compaction_canceled, + const std::shared_ptr info_log, + const std::string* full_history_ts_low) + : input_( + input, cmp, + compaction == + nullptr), // Now only need to count number of entries in flush. cmp_(cmp), merge_helper_(merge_helper), snapshots_(snapshots), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), snapshot_checker_(snapshot_checker), env_(env), + clock_(env_->GetSystemClock().get()), report_detailed_time_(report_detailed_time), expect_valid_internal_key_(expect_valid_internal_key), range_del_agg_(range_del_agg), @@ -84,14 +94,20 @@ CompactionIterator::CompactionIterator( compaction_filter_(compaction_filter), shutting_down_(shutting_down), manual_compaction_paused_(manual_compaction_paused), + manual_compaction_canceled_(manual_compaction_canceled), preserve_deletes_seqnum_(preserve_deletes_seqnum), + info_log_(info_log), + allow_data_in_errors_(allow_data_in_errors), + timestamp_size_(cmp_ ? cmp_->timestamp_size() : 0), + full_history_ts_low_(full_history_ts_low), current_user_key_sequence_(0), current_user_key_snapshot_(0), merge_out_iter_(merge_helper_), + blob_garbage_collection_cutoff_file_number_( + ComputeBlobGarbageCollectionCutoffFileNumber(compaction_.get())), current_key_committed_(false), - info_log_(info_log), - allow_data_in_errors_(allow_data_in_errors) { - assert(compaction_filter_ == nullptr || compaction_ != nullptr); + cmp_with_history_ts_low_(0), + level_(compaction_ == nullptr ? 0 : compaction_->level()) { assert(snapshots_ != nullptr); bottommost_level_ = compaction_ == nullptr ? false @@ -117,14 +133,16 @@ CompactionIterator::CompactionIterator( for (size_t i = 1; i < snapshots_->size(); ++i) { assert(snapshots_->at(i - 1) < snapshots_->at(i)); } + assert(timestamp_size_ == 0 || !full_history_ts_low_ || + timestamp_size_ == full_history_ts_low_->size()); #endif - input_->SetPinnedItersMgr(&pinned_iters_mgr_); + input_.SetPinnedItersMgr(&pinned_iters_mgr_); TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); } CompactionIterator::~CompactionIterator() { - // input_ Iteartor lifetime is longer than pinned_iters_mgr_ lifetime - input_->SetPinnedItersMgr(nullptr); + // input_ Iterator lifetime is longer than pinned_iters_mgr_ lifetime + input_.SetPinnedItersMgr(nullptr); } void CompactionIterator::ResetRecordCounts() { @@ -151,13 +169,13 @@ void CompactionIterator::Next() { if (merge_out_iter_.Valid()) { key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); - Status s = ParseInternalKey(key_, &ikey_); + Status s = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to be valid. assert(s.ok()); if (!s.ok()) { - ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", - key_.ToString(true).c_str()); + ROCKS_LOG_FATAL(info_log_, "Invalid key in compaction. 
%s", + s.getState()); } // Keep current_key_ in sync. @@ -177,7 +195,7 @@ void CompactionIterator::Next() { // Only advance the input iterator if there is no merge output and the // iterator is not already at the next record. if (!at_next_) { - input_->Next(); + AdvanceInputIter(); } NextFromInput(); } @@ -192,100 +210,172 @@ void CompactionIterator::Next() { bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until) { - if (compaction_filter_ != nullptr && - (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex)) { - // If the user has specified a compaction filter and the sequence - // number is greater than any external snapshot, then invoke the - // filter. If the return value of the compaction filter is true, - // replace the entry with a deletion marker. - CompactionFilter::Decision filter; - compaction_filter_value_.clear(); - compaction_filter_skip_until_.Clear(); - CompactionFilter::ValueType value_type = - ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue - : CompactionFilter::ValueType::kBlobIndex; - // Hack: pass internal key to BlobIndexCompactionFilter since it needs - // to get sequence number. - Slice& filter_key = ikey_.type == kTypeValue ? ikey_.user_key : key_; - { - StopWatchNano timer(env_, report_detailed_time_); + if (!compaction_filter_ || + (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) { + return true; + } + bool error = false; + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. If the return value of the compaction filter is true, + // replace the entry with a deletion marker. + CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined; + compaction_filter_value_.clear(); + compaction_filter_skip_until_.Clear(); + CompactionFilter::ValueType value_type = + ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue + : CompactionFilter::ValueType::kBlobIndex; + // Hack: pass internal key to BlobIndexCompactionFilter since it needs + // to get sequence number. + assert(compaction_filter_); + Slice& filter_key = + (ikey_.type == kTypeValue || + !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) + ? ikey_.user_key + : key_; + { + StopWatchNano timer(clock_, report_detailed_time_); + if (kTypeBlobIndex == ikey_.type) { + blob_value_.Reset(); + filter = compaction_filter_->FilterBlobByKey( + level_, filter_key, &compaction_filter_value_, + compaction_filter_skip_until_.rep()); + if (CompactionFilter::Decision::kUndetermined == filter && + !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + // For integrated BlobDB impl, CompactionIterator reads blob value. + // For Stacked BlobDB impl, the corresponding CompactionFilter's + // FilterV2 method should read the blob value. 
+ BlobIndex blob_index; + Status s = blob_index.DecodeFrom(value_); + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + if (blob_index.HasTTL() || blob_index.IsInlined()) { + status_ = Status::Corruption("Unexpected TTL/inlined blob index"); + valid_ = false; + return false; + } + if (compaction_ == nullptr) { + status_ = + Status::Corruption("Unexpected blob index outside of compaction"); + valid_ = false; + return false; + } + const Version* const version = compaction_->input_version(); + assert(version); + + uint64_t bytes_read = 0; + s = version->GetBlob(ReadOptions(), ikey_.user_key, blob_index, + &blob_value_, &bytes_read); + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + ++iter_stats_.num_blobs_read; + iter_stats_.total_blob_bytes_read += bytes_read; + + value_type = CompactionFilter::ValueType::kValue; + } + } + if (CompactionFilter::Decision::kUndetermined == filter) { filter = compaction_filter_->FilterV2( - compaction_->level(), filter_key, value_type, value_, - &compaction_filter_value_, compaction_filter_skip_until_.rep()); - iter_stats_.total_filter_time += - env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0; + level_, filter_key, value_type, + blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_, + compaction_filter_skip_until_.rep()); } + iter_stats_.total_filter_time += + env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0; + } - if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && - cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= - 0) { - // Can't skip to a key smaller than the current one. - // Keep the key as per FilterV2 documentation. - filter = CompactionFilter::Decision::kKeep; - } + if (CompactionFilter::Decision::kUndetermined == filter) { + // Should not reach here, since FilterV2 should never return kUndetermined. 
+ status_ = + Status::NotSupported("FilterV2() should never return kUndetermined"); + valid_ = false; + return false; + } - if (filter == CompactionFilter::Decision::kRemove) { - // convert the current key to a delete; key_ is pointing into - // current_key_ at this point, so updating current_key_ updates key() - ikey_.type = kTypeDeletion; - current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); - // no value associated with delete - value_.clear(); - iter_stats_.num_record_drop_user++; - } else if (filter == CompactionFilter::Decision::kChangeValue) { - if (ikey_.type == kTypeBlobIndex) { - // value transfer from blob file to inlined data - ikey_.type = kTypeValue; - current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); - } - value_ = compaction_filter_value_; - } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { - *need_skip = true; - compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, - kValueTypeForSeek); - *skip_until = compaction_filter_skip_until_.Encode(); - } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) { - if (ikey_.type == kTypeValue) { - // value transfer from inlined data to blob file - ikey_.type = kTypeBlobIndex; - current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); - } - value_ = compaction_filter_value_; - } else if (filter == CompactionFilter::Decision::kIOError) { - status_ = - Status::IOError("Failed to access blob during compaction filter"); + if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && + cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= + 0) { + // Can't skip to a key smaller than the current one. + // Keep the key as per FilterV2 documentation. + filter = CompactionFilter::Decision::kKeep; + } + + if (filter == CompactionFilter::Decision::kRemove) { + // convert the current key to a delete; key_ is pointing into + // current_key_ at this point, so updating current_key_ updates key() + ikey_.type = kTypeDeletion; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); + // no value associated with delete + value_.clear(); + iter_stats_.num_record_drop_user++; + } else if (filter == CompactionFilter::Decision::kChangeValue) { + if (ikey_.type == kTypeBlobIndex) { + // value transfer from blob file to inlined data + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { + *need_skip = true; + compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, + kValueTypeForSeek); + *skip_until = compaction_filter_skip_until_.Encode(); + } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) { + // Only the StackableDB-based BlobDB impl's compaction filter should return + // kChangeBlobIndex. Decision about rewriting blob and changing blob index + // in the integrated BlobDB impl is made in subsequent call to + // PrepareOutput() and its callees. 
+ if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + status_ = Status::NotSupported( + "Only stacked BlobDB's internal compaction filter can return " + "kChangeBlobIndex."); + valid_ = false; + return false; + } + if (ikey_.type == kTypeValue) { + // value transfer from inlined data to blob file + ikey_.type = kTypeBlobIndex; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kIOError) { + if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + status_ = Status::NotSupported( + "CompactionFilter for integrated BlobDB should not return kIOError"); + valid_ = false; return false; } + status_ = Status::IOError("Failed to access blob during compaction filter"); + error = true; } - return true; + return !error; } void CompactionIterator::NextFromInput() { at_next_ = false; valid_ = false; - while (!valid_ && input_->Valid() && !IsPausingManualCompaction() && + while (!valid_ && input_.Valid() && !IsPausingManualCompaction() && !IsShuttingDown()) { - key_ = input_->key(); - value_ = input_->value(); + key_ = input_.key(); + value_ = input_.value(); iter_stats_.num_input_records++; - Status pikStatus = ParseInternalKey(key_, &ikey_); - if (!pikStatus.ok()) { + Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); + if (!pik_status.ok()) { iter_stats_.num_input_corrupt_records++; // If `expect_valid_internal_key_` is false, return the corrupted key // and let the caller decide what to do with it. - // TODO(noetzli): We should have a more elegant solution for this. if (expect_valid_internal_key_) { - std::string msg("Corrupted internal key not expected."); - if (allow_data_in_errors_) { - msg.append(" Corrupt key: " + ikey_.user_key.ToString(/*hex=*/true) + - ". "); - msg.append("key type: " + std::to_string(ikey_.type) + "."); - msg.append("seq: " + std::to_string(ikey_.sequence) + "."); - } - status_ = Status::Corruption(msg.c_str()); + status_ = pik_status; return; } key_ = current_key_.SetInternalKey(key_); @@ -298,7 +388,8 @@ void CompactionIterator::NextFromInput() { TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_); // Update input statistics - if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { + if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion || + ikey_.type == kTypeDeletionWithTimestamp) { iter_stats_.num_input_deletion_records++; } iter_stats_.total_input_raw_key_bytes += key_.size(); @@ -311,19 +402,54 @@ void CompactionIterator::NextFromInput() { // merge_helper_->compaction_filter_skip_until_. Slice skip_until; + bool user_key_equal_without_ts = false; + int cmp_ts = 0; + if (has_current_user_key_) { + user_key_equal_without_ts = + cmp_->EqualWithoutTimestamp(ikey_.user_key, current_user_key_); + // if timestamp_size_ > 0, then curr_ts_ has been initialized by a + // previous key. + cmp_ts = timestamp_size_ ? cmp_->CompareTimestamp( + ExtractTimestampFromUserKey( + ikey_.user_key, timestamp_size_), + curr_ts_) + : 0; + } + // Check whether the user key changed. After this if statement current_key_ // is a copy of the current input key (maybe converted to a delete by the // compaction filter). ikey_.user_key is pointing to the copy. 
- if (!has_current_user_key_ || - !cmp_->Equal(ikey_.user_key, current_user_key_)) { + if (!has_current_user_key_ || !user_key_equal_without_ts || cmp_ts != 0) { // First occurrence of this user key // Copy key for output key_ = current_key_.SetInternalKey(key_, &ikey_); + + // If timestamp_size_ > 0, then copy from ikey_ to curr_ts_ for the use + // in next iteration to compare with the timestamp of next key. + UpdateTimestampAndCompareWithFullHistoryLow(); + + // If + // (1) !has_current_user_key_, OR + // (2) timestamp is disabled, OR + // (3) all history will be preserved, OR + // (4) user key (excluding timestamp) is different from previous key, OR + // (5) timestamp is NO older than *full_history_ts_low_ + // then current_user_key_ must be treated as a different user key. + // This means, if a user key (excluding ts) is the same as the previous + // user key, and its ts is older than *full_history_ts_low_, then we + // consider this key for GC, e.g. it may be dropped if certain conditions + // match. + if (!has_current_user_key_ || !timestamp_size_ || !full_history_ts_low_ || + !user_key_equal_without_ts || cmp_with_history_ts_low_ >= 0) { + // Initialize for future comparison for rule (A) and etc. + current_user_key_sequence_ = kMaxSequenceNumber; + current_user_key_snapshot_ = 0; + has_current_user_key_ = true; + } current_user_key_ = ikey_.user_key; - has_current_user_key_ = true; + has_outputted_key_ = false; - current_user_key_sequence_ = kMaxSequenceNumber; - current_user_key_snapshot_ = 0; + current_key_committed_ = KeyCommitted(ikey_.sequence); // Apply the compaction filter to the first committed version of the user @@ -381,8 +507,8 @@ void CompactionIterator::NextFromInput() { // In the previous iteration we encountered a single delete that we could // not compact out. We will keep this Put, but can drop it's data. // (See Optimization 3, below.) - assert(ikey_.type == kTypeValue); - if (ikey_.type != kTypeValue) { + assert(ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex); + if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex) { ROCKS_LOG_FATAL(info_log_, "Unexpected key type %d for compaction output", ikey_.type); @@ -395,6 +521,11 @@ void CompactionIterator::NextFromInput() { current_user_key_snapshot_, last_snapshot); } + if (ikey_.type == kTypeBlobIndex) { + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + } + value_.clear(); valid_ = true; clear_and_output_next_key_ = false; @@ -434,12 +565,13 @@ void CompactionIterator::NextFromInput() { // The easiest way to process a SingleDelete during iteration is to peek // ahead at the next key. ParsedInternalKey next_ikey; - input_->Next(); + AdvanceInputIter(); // Check whether the next key exists, is not corrupt, and is the same key // as the single delete. - if (input_->Valid() && - ParseInternalKey(input_->key(), &next_ikey) == Status::OK() && + if (input_.Valid() && + ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok() && cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { // Check whether the next key belongs to the same snapshot as the // SingleDelete. @@ -452,7 +584,7 @@ void CompactionIterator::NextFromInput() { // to handle the second SingleDelete // First SingleDelete has been skipped since we already called - // input_->Next(). + // input_.Next(). 
++iter_stats_.num_record_drop_obsolete; ++iter_stats_.num_single_del_mismatch; } else if (has_outputted_key_ || @@ -474,9 +606,9 @@ void CompactionIterator::NextFromInput() { ++iter_stats_.num_record_drop_hidden; ++iter_stats_.num_record_drop_obsolete; - // Already called input_->Next() once. Call it a second time to + // Already called input_.Next() once. Call it a second time to // skip past the second key. - input_->Next(); + AdvanceInputIter(); } else { // Found a matching value, but we cannot drop both keys since // there is an earlier snapshot and we need to leave behind a record @@ -543,9 +675,12 @@ void CompactionIterator::NextFromInput() { last_sequence, current_user_key_sequence_); } - ++iter_stats_.num_record_drop_hidden; // (A) - input_->Next(); - } else if (compaction_ != nullptr && ikey_.type == kTypeDeletion && + ++iter_stats_.num_record_drop_hidden; // rule (A) + AdvanceInputIter(); + } else if (compaction_ != nullptr && + (ikey_.type == kTypeDeletion || + (ikey_.type == kTypeDeletionWithTimestamp && + cmp_with_history_ts_low_ < 0)) && IN_EARLIEST_SNAPSHOT(ikey_.sequence) && ikeyNotNeededForIncrementalSnapshot() && compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, @@ -569,35 +704,47 @@ void CompactionIterator::NextFromInput() { // given that: // (1) The deletion is earlier than earliest_write_conflict_snapshot, and // (2) No value exist earlier than the deletion. + // + // Note also that a deletion marker of type kTypeDeletionWithTimestamp + // will be treated as a different user key unless the timestamp is older + // than *full_history_ts_low_. ++iter_stats_.num_record_drop_obsolete; if (!bottommost_level_) { ++iter_stats_.num_optimized_del_drop_obsolete; } - input_->Next(); - } else if ((ikey_.type == kTypeDeletion) && bottommost_level_ && - ikeyNotNeededForIncrementalSnapshot()) { + AdvanceInputIter(); + } else if ((ikey_.type == kTypeDeletion || + (ikey_.type == kTypeDeletionWithTimestamp && + cmp_with_history_ts_low_ < 0)) && + bottommost_level_ && ikeyNotNeededForIncrementalSnapshot()) { // Handle the case where we have a delete key at the bottom most level // We can skip outputting the key iff there are no subsequent puts for this // key assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel( ikey_.user_key, &level_ptrs_)); ParsedInternalKey next_ikey; - input_->Next(); - // Skip over all versions of this key that happen to occur in the same snapshot - // range as the delete + AdvanceInputIter(); + // Skip over all versions of this key that happen to occur in the same + // snapshot range as the delete. + // + // Note that a deletion marker of type kTypeDeletionWithTimestamp will be + // considered to have a different user key unless the timestamp is older + // than *full_history_ts_low_. 
while (!IsPausingManualCompaction() && !IsShuttingDown() && - input_->Valid() && - (ParseInternalKey(input_->key(), &next_ikey) == Status::OK()) && - cmp_->Equal(ikey_.user_key, next_ikey.user_key) && + input_.Valid() && + (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok()) && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) && (prev_snapshot == 0 || DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) { - input_->Next(); + AdvanceInputIter(); } // If you find you still need to output a row with this key, we need to output the // delete too - if (input_->Valid() && - (ParseInternalKey(input_->key(), &next_ikey) == Status::OK()) && - cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { + if (input_.Valid() && + (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok()) && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { valid_ = true; at_next_ = true; } @@ -613,8 +760,9 @@ void CompactionIterator::NextFromInput() { // have hit (A) // We encapsulate the merge related state machine in a different // object to minimize change to the existing flow. - Status s = merge_helper_->MergeUntil(input_, range_del_agg_, - prev_snapshot, bottommost_level_); + Status s = + merge_helper_->MergeUntil(&input_, range_del_agg_, prev_snapshot, + bottommost_level_, allow_data_in_errors_); merge_out_iter_.SeekToFirst(); if (!s.ok() && !s.IsMergeInProgress()) { @@ -625,13 +773,13 @@ void CompactionIterator::NextFromInput() { // These will be correctly set below. key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); - pikStatus = ParseInternalKey(key_, &ikey_); + pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to valid. - assert(pikStatus.ok()); - if (!pikStatus.ok()) { - ROCKS_LOG_FATAL(info_log_, "Invalid key (%s) in compaction", - key_.ToString(true).c_str()); + assert(pik_status.ok()); + if (!pik_status.ok()) { + ROCKS_LOG_FATAL(info_log_, "Invalid key in compaction. %s", + pik_status.getState()); } // Keep current_key_ in sync. 
current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); @@ -657,14 +805,14 @@ void CompactionIterator::NextFromInput() { if (should_delete) { ++iter_stats_.num_record_drop_hidden; ++iter_stats_.num_record_drop_range_del; - input_->Next(); + AdvanceInputIter(); } else { valid_ = true; } } if (need_skip) { - input_->Seek(skip_until); + SkipUntil(skip_until); } } @@ -677,42 +825,144 @@ void CompactionIterator::NextFromInput() { } } -void CompactionIterator::PrepareOutput() { - if (valid_) { - if (ikey_.type == kTypeValue) { - if (blob_file_builder_) { - blob_index_.clear(); - const Status s = - blob_file_builder_->Add(user_key(), value_, &blob_index_); +bool CompactionIterator::ExtractLargeValueIfNeededImpl() { + if (!blob_file_builder_) { + return false; + } - if (!s.ok()) { - status_ = s; - valid_ = false; - } else if (!blob_index_.empty()) { - value_ = blob_index_; - ikey_.type = kTypeBlobIndex; - current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); - } + blob_index_.clear(); + const Status s = blob_file_builder_->Add(user_key(), value_, &blob_index_); + + if (!s.ok()) { + status_ = s; + valid_ = false; + + return false; + } + + if (blob_index_.empty()) { + return false; + } + + value_ = blob_index_; + + return true; +} + +void CompactionIterator::ExtractLargeValueIfNeeded() { + assert(ikey_.type == kTypeValue); + + if (!ExtractLargeValueIfNeededImpl()) { + return; + } + + ikey_.type = kTypeBlobIndex; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); +} + +void CompactionIterator::GarbageCollectBlobIfNeeded() { + assert(ikey_.type == kTypeBlobIndex); + + if (!compaction_) { + return; + } + + // GC for integrated BlobDB + if (compaction_->enable_blob_garbage_collection()) { + BlobIndex blob_index; + + { + const Status s = blob_index.DecodeFrom(value_); + + if (!s.ok()) { + status_ = s; + valid_ = false; + + return; } - } else if (ikey_.type == kTypeBlobIndex) { - if (compaction_filter_) { - const auto blob_decision = compaction_filter_->PrepareBlobOutput( - user_key(), value_, &compaction_filter_value_); + } - if (blob_decision == CompactionFilter::BlobDecision::kCorruption) { - status_ = Status::Corruption( - "Corrupted blob reference encountered during GC"); - valid_ = false; - } else if (blob_decision == CompactionFilter::BlobDecision::kIOError) { - status_ = Status::IOError("Could not relocate blob during GC"); - valid_ = false; - } else if (blob_decision == - CompactionFilter::BlobDecision::kChangeValue) { - value_ = compaction_filter_value_; - } + if (blob_index.IsInlined() || blob_index.HasTTL()) { + status_ = Status::Corruption("Unexpected TTL/inlined blob index"); + valid_ = false; + + return; + } + + if (blob_index.file_number() >= + blob_garbage_collection_cutoff_file_number_) { + return; + } + + const Version* const version = compaction_->input_version(); + assert(version); + + uint64_t bytes_read = 0; + + { + const Status s = version->GetBlob(ReadOptions(), user_key(), blob_index, + &blob_value_, &bytes_read); + + if (!s.ok()) { + status_ = s; + valid_ = false; + + return; } } + ++iter_stats_.num_blobs_read; + iter_stats_.total_blob_bytes_read += bytes_read; + + value_ = blob_value_; + + if (ExtractLargeValueIfNeededImpl()) { + return; + } + + ikey_.type = kTypeValue; + current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + + return; + } + + // GC for stacked BlobDB + if (compaction_filter_ && + compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { + const auto blob_decision = compaction_filter_->PrepareBlobOutput( + user_key(), value_, 
&compaction_filter_value_); + + if (blob_decision == CompactionFilter::BlobDecision::kCorruption) { + status_ = + Status::Corruption("Corrupted blob reference encountered during GC"); + valid_ = false; + + return; + } + + if (blob_decision == CompactionFilter::BlobDecision::kIOError) { + status_ = Status::IOError("Could not relocate blob during GC"); + valid_ = false; + + return; + } + + if (blob_decision == CompactionFilter::BlobDecision::kChangeValue) { + value_ = compaction_filter_value_; + + return; + } + } +} + +void CompactionIterator::PrepareOutput() { + if (valid_) { + if (ikey_.type == kTypeValue) { + ExtractLargeValueIfNeeded(); + } else if (ikey_.type == kTypeBlobIndex) { + GarbageCollectBlobIfNeeded(); + } + // Zeroing out the sequence number leads to better compression. // If this is the bottommost level (no files in lower levels) // and the earliest snapshot is larger than this seqno @@ -735,7 +985,18 @@ void CompactionIterator::PrepareOutput() { ikey_.type); } ikey_.sequence = 0; - current_key_.UpdateInternalKey(0, ikey_.type); + if (!timestamp_size_) { + current_key_.UpdateInternalKey(0, ikey_.type); + } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) { + // We can also zero out timestamp for better compression. + // For the same user key (excluding timestamp), the timestamp-based + // history can be collapsed to save some space if the timestamp is + // older than *full_history_ts_low_. + const std::string kTsMin(timestamp_size_, static_cast(0)); + const Slice ts_slice = kTsMin; + ikey_.SetTimestamp(ts_slice); + current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice); + } } } } @@ -827,4 +1088,30 @@ bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) { return in_snapshot == SnapshotCheckerResult::kInSnapshot; } +uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber( + const CompactionProxy* compaction) { + if (!compaction) { + return 0; + } + + if (!compaction->enable_blob_garbage_collection()) { + return 0; + } + + Version* const version = compaction->input_version(); + assert(version); + + const VersionStorageInfo* const storage_info = version->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + + auto it = blob_files.begin(); + std::advance( + it, compaction->blob_garbage_collection_age_cutoff() * blob_files.size()); + + return it != blob_files.end() ? it->first + : std::numeric_limits::max(); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index 29dedd3c719..65df5c44498 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include #include @@ -23,76 +24,166 @@ namespace ROCKSDB_NAMESPACE { class BlobFileBuilder; +// A wrapper of internal iterator whose purpose is to count how +// many entries there are in the iterator. 
+class SequenceIterWrapper : public InternalIterator { + public: + SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp, + bool need_count_entries) + : icmp_(cmp, /*named=*/false), + inner_iter_(iter), + need_count_entries_(need_count_entries) {} + bool Valid() const override { return inner_iter_->Valid(); } + Status status() const override { return inner_iter_->status(); } + void Next() override { + num_itered_++; + inner_iter_->Next(); + } + void Seek(const Slice& target) override { + if (!need_count_entries_) { + inner_iter_->Seek(target); + } else { + // For flush cases, we need to count total number of entries, so we + // do Next() rather than Seek(). + while (inner_iter_->Valid() && + icmp_.Compare(inner_iter_->key(), target) < 0) { + Next(); + } + } + } + Slice key() const override { return inner_iter_->key(); } + Slice value() const override { return inner_iter_->value(); } + + // Unused InternalIterator methods + void SeekToFirst() override { assert(false); } + void Prev() override { assert(false); } + void SeekForPrev(const Slice& /* target */) override { assert(false); } + void SeekToLast() override { assert(false); } + + uint64_t num_itered() const { return num_itered_; } + + private: + InternalKeyComparator icmp_; + InternalIterator* inner_iter_; // not owned + uint64_t num_itered_ = 0; + bool need_count_entries_; +}; + class CompactionIterator { public: // A wrapper around Compaction. Has a much smaller interface, only what // CompactionIterator uses. Tests can override it. class CompactionProxy { public: - explicit CompactionProxy(const Compaction* compaction) - : compaction_(compaction) {} - virtual ~CompactionProxy() = default; - virtual int level(size_t /*compaction_input_level*/ = 0) const { - return compaction_->level(); - } + + virtual int level() const = 0; + virtual bool KeyNotExistsBeyondOutputLevel( - const Slice& user_key, std::vector* level_ptrs) const { + const Slice& user_key, std::vector* level_ptrs) const = 0; + + virtual bool bottommost_level() const = 0; + + virtual int number_levels() const = 0; + + virtual Slice GetLargestUserKey() const = 0; + + virtual bool allow_ingest_behind() const = 0; + + virtual bool preserve_deletes() const = 0; + + virtual bool enable_blob_garbage_collection() const = 0; + + virtual double blob_garbage_collection_age_cutoff() const = 0; + + virtual Version* input_version() const = 0; + }; + + class RealCompaction : public CompactionProxy { + public: + explicit RealCompaction(const Compaction* compaction) + : compaction_(compaction) { + assert(compaction_); + assert(compaction_->immutable_options()); + assert(compaction_->mutable_cf_options()); + } + + int level() const override { return compaction_->level(); } + + bool KeyNotExistsBeyondOutputLevel( + const Slice& user_key, std::vector* level_ptrs) const override { return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs); } - virtual bool bottommost_level() const { + + bool bottommost_level() const override { return compaction_->bottommost_level(); } - virtual int number_levels() const { return compaction_->number_levels(); } - virtual Slice GetLargestUserKey() const { + + int number_levels() const override { return compaction_->number_levels(); } + + Slice GetLargestUserKey() const override { return compaction_->GetLargestUserKey(); } - virtual bool allow_ingest_behind() const { - return compaction_->immutable_cf_options()->allow_ingest_behind; + + bool allow_ingest_behind() const override { + return 
compaction_->immutable_options()->allow_ingest_behind; + } + + bool preserve_deletes() const override { + return compaction_->immutable_options()->preserve_deletes; + } + + bool enable_blob_garbage_collection() const override { + return compaction_->mutable_cf_options()->enable_blob_garbage_collection; } - virtual bool preserve_deletes() const { - return compaction_->immutable_cf_options()->preserve_deletes; + + double blob_garbage_collection_age_cutoff() const override { + return compaction_->mutable_cf_options() + ->blob_garbage_collection_age_cutoff; } - protected: - CompactionProxy() = default; + Version* input_version() const override { + return compaction_->input_version(); + } private: const Compaction* compaction_; }; - CompactionIterator(InternalIterator* input, const Comparator* cmp, - MergeHelper* merge_helper, SequenceNumber last_sequence, - std::vector* snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, Env* env, - bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregator* range_del_agg, - BlobFileBuilder* blob_file_builder, - bool allow_data_in_errors, - const Compaction* compaction = nullptr, - const CompactionFilter* compaction_filter = nullptr, - const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0, - const std::atomic* manual_compaction_paused = nullptr, - const std::shared_ptr info_log = nullptr); + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, + const Compaction* compaction = nullptr, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic* shutting_down = nullptr, + const SequenceNumber preserve_deletes_seqnum = 0, + const std::atomic* manual_compaction_paused = nullptr, + const std::atomic* manual_compaction_canceled = nullptr, + const std::shared_ptr info_log = nullptr, + const std::string* full_history_ts_low = nullptr); // Constructor with custom CompactionProxy, used for tests. 
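// Aside (editorial sketch, not part of the patch): making CompactionProxy a
// pure-virtual interface is a test seam. Production code reaches the real
// Compaction through the RealCompaction adapter above, while tests hand in a
// double with canned answers. The self-contained model below uses
// hypothetical names; the real adapter forwards to Compaction instead of a
// plain struct.

struct ProxySketch {
  virtual ~ProxySketch() = default;
  virtual bool bottommost_level() const = 0;
};

struct RealThingSketch {  // stands in for the production object
  bool bottommost = false;
};

// Production-style adapter: forwards every query to the real source of truth.
class RealProxySketch : public ProxySketch {
 public:
  explicit RealProxySketch(const RealThingSketch* real) : real_(real) {}
  bool bottommost_level() const override { return real_->bottommost; }

 private:
  const RealThingSketch* real_;
};

// Test double: the test writes the field it wants the code under test to see.
struct FakeProxySketch : public ProxySketch {
  bool is_bottommost = false;
  bool bottommost_level() const override { return is_bottommost; }
};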
- CompactionIterator(InternalIterator* input, const Comparator* cmp, - MergeHelper* merge_helper, SequenceNumber last_sequence, - std::vector* snapshots, - SequenceNumber earliest_write_conflict_snapshot, - const SnapshotChecker* snapshot_checker, Env* env, - bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregator* range_del_agg, - BlobFileBuilder* blob_file_builder, - bool allow_data_in_errors, - std::unique_ptr compaction, - const CompactionFilter* compaction_filter = nullptr, - const std::atomic* shutting_down = nullptr, - const SequenceNumber preserve_deletes_seqnum = 0, - const std::atomic* manual_compaction_paused = nullptr, - const std::shared_ptr info_log = nullptr); + CompactionIterator( + InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, + SequenceNumber last_sequence, std::vector* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, + BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, + std::unique_ptr compaction, + const CompactionFilter* compaction_filter = nullptr, + const std::atomic* shutting_down = nullptr, + const SequenceNumber preserve_deletes_seqnum = 0, + const std::atomic* manual_compaction_paused = nullptr, + const std::atomic* manual_compaction_canceled = nullptr, + const std::shared_ptr info_log = nullptr, + const std::string* full_history_ts_low = nullptr); ~CompactionIterator(); @@ -116,16 +207,36 @@ class CompactionIterator { bool Valid() const { return valid_; } const Slice& user_key() const { return current_user_key_; } const CompactionIterationStats& iter_stats() const { return iter_stats_; } + uint64_t num_input_entry_scanned() const { return input_.num_itered(); } private: // Processes the input stream to find the next output void NextFromInput(); - // Do last preparations before presenting the output to the callee. At this - // point this only zeroes out the sequence number if possible for better - // compression. + // Do final preparations before presenting the output to the callee. void PrepareOutput(); + // Passes the output value to the blob file builder (if any), and replaces it + // with the corresponding blob reference if it has been actually written to a + // blob file (i.e. if it passed the value size check). Returns true if the + // value got extracted to a blob file, false otherwise. + bool ExtractLargeValueIfNeededImpl(); + + // Extracts large values as described above, and updates the internal key's + // type to kTypeBlobIndex if the value got extracted. Should only be called + // for regular values (kTypeValue). + void ExtractLargeValueIfNeeded(); + + // Relocates valid blobs residing in the oldest blob files if garbage + // collection is enabled. Relocated blobs are written to new blob files or + // inlined in the LSM tree depending on the current settings (i.e. + // enable_blob_files and min_blob_size). Should only be called for blob + // references (kTypeBlobIndex). + // + // Note: the stacked BlobDB implementation's compaction filter based GC + // algorithm is also called from here. + void GarbageCollectBlobIfNeeded(); + // Invoke compaction filter if needed. // Return true on success, false on failures (e.g.: kIOError). 
bool InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until); @@ -152,7 +263,24 @@ class CompactionIterator { bool IsInEarliestSnapshot(SequenceNumber sequence); - InternalIterator* input_; + // Extract user-defined timestamp from user key if possible and compare it + // with *full_history_ts_low_ if applicable. + inline void UpdateTimestampAndCompareWithFullHistoryLow() { + if (!timestamp_size_) { + return; + } + Slice ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_); + curr_ts_.assign(ts.data(), ts.size()); + if (full_history_ts_low_) { + cmp_with_history_ts_low_ = + cmp_->CompareTimestamp(ts, *full_history_ts_low_); + } + } + + static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber( + const CompactionProxy* compaction); + + SequenceIterWrapper input_; const Comparator* cmp_; MergeHelper* merge_helper_; const std::vector* snapshots_; @@ -166,6 +294,7 @@ class CompactionIterator { const SequenceNumber earliest_write_conflict_snapshot_; const SnapshotChecker* const snapshot_checker_; Env* env_; + SystemClock* clock_; bool report_detailed_time_; bool expect_valid_internal_key_; CompactionRangeDelAggregator* range_del_agg_; @@ -174,6 +303,7 @@ class CompactionIterator { const CompactionFilter* compaction_filter_; const std::atomic* shutting_down_; const std::atomic* manual_compaction_paused_; + const std::atomic* manual_compaction_canceled_; const SequenceNumber preserve_deletes_seqnum_; bool bottommost_level_; bool valid_ = false; @@ -181,6 +311,20 @@ class CompactionIterator { SequenceNumber earliest_snapshot_; SequenceNumber latest_snapshot_; + std::shared_ptr info_log_; + + bool allow_data_in_errors_; + + // Comes from comparator. + const size_t timestamp_size_; + + // Lower bound timestamp to retain full history in terms of user-defined + // timestamp. If a key's timestamp is older than full_history_ts_low_, then + // the key *may* be eligible for garbage collection (GC). The skipping logic + // is in `NextFromInput()` and `PrepareOutput()`. + // If nullptr, NO GC will be performed and all history will be preserved. + const std::string* const full_history_ts_low_; + // State // // Points to a copy of the current compaction iterator output (current_key_) @@ -199,11 +343,13 @@ class CompactionIterator { // Stores whether ikey_.user_key is valid. If set to false, the user key is // not compared against the current key in the underlying iterator. bool has_current_user_key_ = false; - bool at_next_ = false; // If false, the iterator - // Holds a copy of the current compaction iterator output (or current key in - // the underlying iterator during NextFromInput()). + // If false, the iterator holds a copy of the current compaction iterator + // output (or current key in the underlying iterator during NextFromInput()). + bool at_next_ = false; + IterKey current_key_; Slice current_user_key_; + std::string curr_ts_; SequenceNumber current_user_key_sequence_; SequenceNumber current_user_key_snapshot_; @@ -218,7 +364,11 @@ class CompactionIterator { // PinnedIteratorsManager used to pin input_ Iterator blocks while reading // merge operands and then releasing them after consuming them. 
PinnedIteratorsManager pinned_iters_mgr_; + + uint64_t blob_garbage_collection_cutoff_file_number_; + std::string blob_index_; + PinnableSlice blob_value_; std::string compaction_filter_value_; InternalKey compaction_filter_skip_until_; // "level_ptrs" holds indices that remember which file of an associated @@ -233,9 +383,15 @@ class CompactionIterator { // Used to avoid purging uncommitted values. The application can specify // uncommitted values by providing a SnapshotChecker object. bool current_key_committed_; - std::shared_ptr info_log_; - bool allow_data_in_errors_; + // Saved result of ucmp->CompareTimestamp(current_ts_, *full_history_ts_low_) + int cmp_with_history_ts_low_; + + const int level_; + + void AdvanceInputIter() { input_.Next(); } + + void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); } bool IsShuttingDown() { // This is a best-effort facility, so memory_order_relaxed is sufficient. @@ -244,8 +400,10 @@ class CompactionIterator { bool IsPausingManualCompaction() { // This is a best-effort facility, so memory_order_relaxed is sufficient. - return manual_compaction_paused_ && - manual_compaction_paused_->load(std::memory_order_relaxed) > 0; + return (manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed) > 0) || + (manual_compaction_canceled_ && + manual_compaction_canceled_->load(std::memory_order_relaxed)); } }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc index 57db424894d..fef7b541786 100644 --- a/db/compaction/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -38,7 +38,7 @@ class NoMergingMergeOp : public MergeOperator { // Compaction filter that gets stuck when it sees a particular key, // then gets unstuck when told to. -// Always returns Decition::kRemove. +// Always returns Decision::kRemove. class StallingFilter : public CompactionFilter { public: Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/, @@ -156,23 +156,32 @@ class LoggingForwardVectorIterator : public InternalIterator { class FakeCompaction : public CompactionIterator::CompactionProxy { public: - FakeCompaction() = default; + int level() const override { return 0; } - int level(size_t /*compaction_input_level*/) const override { return 0; } bool KeyNotExistsBeyondOutputLevel( const Slice& /*user_key*/, std::vector* /*level_ptrs*/) const override { return is_bottommost_level || key_not_exists_beyond_output_level; } + bool bottommost_level() const override { return is_bottommost_level; } + int number_levels() const override { return 1; } + Slice GetLargestUserKey() const override { return "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; } + bool allow_ingest_behind() const override { return is_allow_ingest_behind; } bool preserve_deletes() const override { return false; } + bool enable_blob_garbage_collection() const override { return false; } + + double blob_garbage_collection_age_cutoff() const override { return 0.0; } + + Version* input_version() const override { return nullptr; } + bool key_not_exists_beyond_output_level = false; bool is_bottommost_level = false; @@ -180,7 +189,7 @@ class FakeCompaction : public CompactionIterator::CompactionProxy { bool is_allow_ingest_behind = false; }; -// A simplifed snapshot checker which assumes each snapshot has a global +// A simplified snapshot checker which assumes each snapshot has a global // last visible sequence. 
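// Aside (simplified stand-in, not part of the patch): "global last visible
// sequence" means a record written at sequence number seq is treated as
// visible in snapshot snap iff seq <= last_visible(snap); a snapshot without
// an explicit entry falls back to its own sequence number. A minimal model:

#include <cstdint>
#include <map>
#include <utility>

class SimpleSnapshotCheckerSketch {
 public:
  explicit SimpleSnapshotCheckerSketch(
      std::map<uint64_t, uint64_t> last_visible)
      : last_visible_(std::move(last_visible)) {}

  // Returns true if a record written at `seq` is visible to `snapshot`.
  bool IsInSnapshot(uint64_t seq, uint64_t snapshot) const {
    const auto it = last_visible_.find(snapshot);
    const uint64_t bound = (it != last_visible_.end()) ? it->second : snapshot;
    return seq <= bound;
  }

 private:
  std::map<uint64_t, uint64_t> last_visible_;
};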
class TestSnapshotChecker : public SnapshotChecker { public: @@ -216,6 +225,9 @@ class CompactionIteratorTest : public testing::TestWithParam { CompactionIteratorTest() : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {} + explicit CompactionIteratorTest(const Comparator* ucmp) + : cmp_(ucmp), icmp_(cmp_), snapshots_({}) {} + void InitIterators( const std::vector& ks, const std::vector& vs, const std::vector& range_del_ks, @@ -224,7 +236,9 @@ class CompactionIteratorTest : public testing::TestWithParam { SequenceNumber last_committed_sequence = kMaxSequenceNumber, MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr, bool bottommost_level = false, - SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + bool key_not_exists_beyond_output_level = false, + const std::string* full_history_ts_low = nullptr) { std::unique_ptr unfragmented_range_del_iter( new test::VectorIterator(range_del_ks, range_del_vs)); auto tombstone_list = std::make_shared( @@ -236,10 +250,12 @@ class CompactionIteratorTest : public testing::TestWithParam { range_del_agg_->AddTombstones(std::move(range_del_iter)); std::unique_ptr compaction; - if (filter || bottommost_level) { + if (filter || bottommost_level || key_not_exists_beyond_output_level) { compaction_proxy_ = new FakeCompaction(); compaction_proxy_->is_bottommost_level = bottommost_level; compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind(); + compaction_proxy_->key_not_exists_beyond_output_level = + key_not_exists_beyond_output_level; compaction.reset(compaction_proxy_); } bool use_snapshot_checker = UseSnapshotChecker() || GetParam(); @@ -252,6 +268,11 @@ class CompactionIteratorTest : public testing::TestWithParam { 0 /*latest_snapshot*/, snapshot_checker_.get(), 0 /*level*/, nullptr /*statistics*/, &shutting_down_)); + if (c_iter_) { + // Since iter_ is still used in ~CompactionIterator(), we call + // ~CompactionIterator() first. 
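// Aside (illustrative only, not part of the patch): the reset order above
// matters because the compaction iterator keeps a raw pointer to the wrapped
// input iterator; destroying the dependent object first avoids a dangling
// pointer. Generic pattern, with hypothetical types:

#include <memory>

struct InnerSketch {};  // the object pointed to
struct WrapperSketch {  // holds a non-owning pointer into InnerSketch
  explicit WrapperSketch(InnerSketch* in) : in_(in) {}
  InnerSketch* in_;
};

inline void RebuildSketch(std::unique_ptr<InnerSketch>& inner,
                          std::unique_ptr<WrapperSketch>& wrapper) {
  wrapper.reset();                          // drop the user of the pointer first
  inner = std::make_unique<InnerSketch>();  // now safe to replace the pointee
  wrapper = std::make_unique<WrapperSketch>(inner.get());
}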
+ c_iter_.reset(); + } iter_.reset(new LoggingForwardVectorIterator(ks, vs)); iter_->SeekToFirst(); c_iter_.reset(new CompactionIterator( @@ -259,8 +280,11 @@ class CompactionIteratorTest : public testing::TestWithParam { earliest_write_conflict_snapshot, snapshot_checker_.get(), Env::Default(), false /* report_detailed_time */, false, range_del_agg_.get(), nullptr /* blob_file_builder */, - false /*allow_data_in_errors*/, std::move(compaction), filter, - &shutting_down_)); + true /*allow_data_in_errors*/, std::move(compaction), filter, + &shutting_down_, /*preserve_deletes_seqnum=*/0, + /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, /*info_log=*/nullptr, + full_history_ts_low)); } void AddSnapshot(SequenceNumber snapshot, @@ -282,10 +306,13 @@ class CompactionIteratorTest : public testing::TestWithParam { MergeOperator* merge_operator = nullptr, CompactionFilter* compaction_filter = nullptr, bool bottommost_level = false, - SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber, + bool key_not_exists_beyond_output_level = false, + const std::string* full_history_ts_low = nullptr) { InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber, last_committed_seq, merge_operator, compaction_filter, - bottommost_level, earliest_write_conflict_snapshot); + bottommost_level, earliest_write_conflict_snapshot, + key_not_exists_beyond_output_level, full_history_ts_low); c_iter_->SeekToFirst(); for (size_t i = 0; i < expected_keys.size(); i++) { std::string info = "i = " + ToString(i); @@ -299,6 +326,11 @@ class CompactionIteratorTest : public testing::TestWithParam { ASSERT_FALSE(c_iter_->Valid()); } + void ClearSnapshots() { + snapshots_.clear(); + snapshot_map_.clear(); + } + const Comparator* cmp_; const InternalKeyComparator icmp_; std::vector snapshots_; @@ -680,7 +712,7 @@ TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) { RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, {"v1", "v2"}, {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, - {"v1", "v2"}, kMaxSequenceNumber /*last_commited_seq*/, + {"v1", "v2"}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -689,15 +721,14 @@ TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) { // permanently. 
TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) { AddSnapshot(1); - RunTest({test::KeyStr("a", 1, kTypeDeletion), - test::KeyStr("b", 3, kTypeDeletion), - test::KeyStr("b", 1, kTypeValue)}, - {"", "", ""}, - {test::KeyStr("b", 3, kTypeDeletion), - test::KeyStr("b", 0, kTypeValue)}, - {"", ""}, - kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, - nullptr /*compaction_filter*/, true /*bottommost_level*/); + RunTest( + {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 1, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("b", 3, kTypeDeletion), test::KeyStr("b", 0, kTypeValue)}, + {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); } // In bottommost level, single deletions earlier than earliest snapshot can be @@ -707,7 +738,7 @@ TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) { RunTest({test::KeyStr("a", 1, kTypeSingleDeletion), test::KeyStr("b", 2, kTypeSingleDeletion)}, {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""}, - kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, + kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -864,7 +895,7 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, {"v1", "v2", "v3"}, {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue), test::KeyStr("c", 3, kTypeValue)}, - {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_commited_seq*/, + {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -875,9 +906,7 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, RunTest( {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion), test::KeyStr("c", 3, kTypeDeletion)}, - {"", "", ""}, - {}, - {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, + {"", "", ""}, {}, {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, true /*bottommost_level*/); } @@ -885,15 +914,14 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, TEST_F(CompactionIteratorWithSnapshotCheckerTest, NotRemoveDeletionIfValuePresentToEarlierSnapshot) { AddSnapshot(2,1); - RunTest( - {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue), - test::KeyStr("b", 3, kTypeValue)}, - {"", "", ""}, - {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue), - test::KeyStr("b", 3, kTypeValue)}, - {"", "", ""}, kMaxSequenceNumber /*last_commited_seq*/, - nullptr /*merge_operator*/, nullptr /*compaction_filter*/, - true /*bottommost_level*/); + RunTest({test::KeyStr("a", 4, kTypeDeletion), + test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("a", 4, kTypeDeletion), + test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); } TEST_F(CompactionIteratorWithSnapshotCheckerTest, @@ -905,7 +933,7 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, {"", "", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion), test::KeyStr("c", 3, kTypeSingleDeletion)}, - {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, + {"", ""}, kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/, nullptr /*compaction_filter*/, 
true /*bottommost_level*/); } @@ -939,9 +967,24 @@ TEST_F(CompactionIteratorWithSnapshotCheckerTest, 2 /*earliest_write_conflict_snapshot*/); } +// Same as above but with a blob index. In addition to the value getting +// trimmed, the type of the KV is changed to kTypeValue. +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + KeepSingleDeletionForWriteConflictChecking_BlobIndex) { + AddSnapshot(2, 0); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeBlobIndex)}, + {"", "fake_blob_index"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, false /*bottommost_level*/, + 2 /*earliest_write_conflict_snapshot*/); +} + // Compaction filter should keep uncommitted key as-is, and -// * Convert the latest velue to deletion, and/or -// * if latest value is a merge, apply filter to all suequent merges. +// * Convert the latest value to deletion, and/or +// * if latest value is a merge, apply filter to all subsequent merges. TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) { std::unique_ptr compaction_filter( @@ -1033,6 +1076,188 @@ INSTANTIATE_TEST_CASE_P(CompactionIteratorWithAllowIngestBehindTestInstance, CompactionIteratorWithAllowIngestBehindTest, testing::Values(true, false)); +class CompactionIteratorTsGcTest : public CompactionIteratorTest { + public: + CompactionIteratorTsGcTest() + : CompactionIteratorTest(test::ComparatorWithU64Ts()) {} +}; + +TEST_P(CompactionIteratorTsGcTest, NoKeyEligibleForGC) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeValue), + test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, + kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)}; + const std::vector input_values = {"a3", "", "b2"}; + std::string full_history_ts_low; + // All keys' timestamps are newer than or equal to 102, thus none of them + // will be eligible for GC. + PutFixed64(&full_history_ts_low, 102); + const std::vector& expected_keys = input_keys; + const std::vector& expected_values = input_values; + const std::vector> params = { + {false, false}, {false, true}, {true, true}}; + for (const std::pair& param : params) { + const bool bottommost_level = param.first; + const bool key_not_exists_beyond_output_level = param.second; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + bottommost_level, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + key_not_exists_beyond_output_level, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, AllKeysOlderThanThreshold) { + constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}}; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, + kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key[0], /*seq=*/2, kTypeValue), + test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)}; + const std::vector input_values = {"", "a2", "a1", "b5"}; + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits::max()); + { + // With a snapshot at seq 3, both the deletion marker and the key at 3 must + // be preserved. 
+ AddSnapshot(3); + const std::vector expected_keys = { + input_keys[0], input_keys[1], input_keys[3]}; + const std::vector expected_values = {"", "a2", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + ClearSnapshots(); + } + { + // No snapshot, the deletion marker should be preserved because the user + // key may appear beyond output level. + const std::vector expected_keys = {input_keys[0], + input_keys[3]}; + const std::vector expected_values = {"", "b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } + { + // No snapshot, the deletion marker can be dropped because the user key + // does not appear in higher levels. + const std::vector expected_keys = {input_keys[3]}; + const std::vector expected_values = {"b5"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, NewHidesOldSameSnapshot) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeValue), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "a1", "a0"}; + { + std::string full_history_ts_low; + // Keys whose timestamps larger than or equal to 102 will be preserved. + PutFixed64(&full_history_ts_low, 102); + const std::vector expected_keys = {input_keys[0], + input_keys[1]}; + const std::vector expected_values = {"", "a2"}; + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, DropTombstones) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "", "a0"}; + const std::vector expected_keys = {input_keys[0], input_keys[1]}; + const std::vector expected_values = {"", "a2"}; + + // Take a snapshot at seq 2. + AddSnapshot(2); + + { + // Non-bottommost level, but key does not exist beyond output level. 
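// Aside (sketch, not part of the patch): full_history_ts_low in these tests
// is an 8-byte encoded uint64_t timestamp; PutFixed64 appends a fixed-width
// little-endian encoding. A minimal stand-in encoder:

#include <cstdint>
#include <string>

inline void AppendFixed64Sketch(std::string* dst, uint64_t value) {
  for (int i = 0; i < 8; ++i) {
    dst->push_back(static_cast<char>((value >> (8 * i)) & 0xff));
  }
}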
+ std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_sequence=*/kMaxSequenceNumber, + /*merge_op=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/false, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } + { + // Bottommost level + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/true, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low); + } +} + +TEST_P(CompactionIteratorTsGcTest, RewriteTs) { + constexpr char user_key[] = "a"; + const std::vector input_keys = { + test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue), + test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp), + test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)}; + const std::vector input_values = {"", "a2", "", "a0"}; + const std::vector expected_keys = { + input_keys[0], input_keys[1], input_keys[2], + test::KeyStr(/*ts=*/0, user_key, /*seq=*/0, kTypeValue)}; + const std::vector expected_values = {"", "a2", "", "a0"}; + + AddSnapshot(1); + AddSnapshot(2); + + { + // Bottommost level and need to rewrite both ts and seq. + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 102); + RunTest(input_keys, input_values, expected_keys, expected_values, + /*last_committed_seq=*/kMaxSequenceNumber, + /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr, + /*bottommost_level=*/true, + /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber, + /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low); + } +} + +INSTANTIATE_TEST_CASE_P(CompactionIteratorTsGcTestInstance, + CompactionIteratorTsGcTest, + testing::Values(true, false)); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 91fa2dc32e4..7928273426d 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -20,7 +20,10 @@ #include #include +#include "db/blob/blob_file_addition.h" +#include "db/blob/blob_file_builder.h" #include "db/builder.h" +#include "db/compaction/clipping_iterator.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" @@ -44,6 +47,8 @@ #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" +#include "options/configurable_helper.h" +#include "options/options_helper.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -51,6 +56,7 @@ #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "rocksdb/utilities/options_type.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" #include "table/merging_iterator.h" @@ -126,10 +132,12 @@ struct CompactionJob::SubcompactionState { // Files produced by this subcompaction struct Output { Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp, - bool _enable_order_check, bool _enable_hash) + bool _enable_order_check, bool _enable_hash, bool 
_finished = false, + uint64_t precalculated_hash = 0) : meta(std::move(_meta)), - validator(_icmp, _enable_order_check, _enable_hash), - finished(false) {} + validator(_icmp, _enable_order_check, _enable_hash, + precalculated_hash), + finished(_finished) {} FileMetaData meta; OutputValidator validator; bool finished; @@ -138,6 +146,7 @@ struct CompactionJob::SubcompactionState { // State kept for output being generated std::vector outputs; + std::vector blob_file_additions; std::unique_ptr outfile; std::unique_ptr builder; @@ -146,7 +155,7 @@ struct CompactionJob::SubcompactionState { // This subcompaction's output could be empty if compaction was aborted // before this subcompaction had a chance to generate any output files. // When subcompactions are executed sequentially this is more likely and - // will be particulalry likely for the later subcompactions to be empty. + // will be particularly likely for the later subcompactions to be empty. // Once they are run in parallel however it should be much rarer. return nullptr; } else { @@ -231,21 +240,13 @@ struct CompactionJob::CompactionState { std::vector sub_compact_states; Status status; - uint64_t total_bytes; - uint64_t num_output_records; - - explicit CompactionState(Compaction* c) - : compaction(c), - total_bytes(0), - num_output_records(0) {} + size_t num_output_files = 0; + uint64_t total_bytes = 0; + size_t num_blob_output_files = 0; + uint64_t total_blob_bytes = 0; + uint64_t num_output_records = 0; - size_t NumOutputFiles() { - size_t total = 0; - for (auto& s : sub_compact_states) { - total += s.outputs.size(); - } - return total; - } + explicit CompactionState(Compaction* c) : compaction(c) {} Slice SmallestUserKey() { for (const auto& sub_compact_state : sub_compact_states) { @@ -272,21 +273,40 @@ struct CompactionJob::CompactionState { }; void CompactionJob::AggregateStatistics() { + assert(compact_); + for (SubcompactionState& sc : compact_->sub_compact_states) { + auto& outputs = sc.outputs; + + if (!outputs.empty() && !outputs.back().meta.fd.file_size) { + // An error occurred, so ignore the last output. 
+ outputs.pop_back(); + } + + compact_->num_output_files += outputs.size(); compact_->total_bytes += sc.total_bytes; + + const auto& blobs = sc.blob_file_additions; + + compact_->num_blob_output_files += blobs.size(); + + for (const auto& blob : blobs) { + compact_->total_blob_bytes += blob.GetTotalBlobBytes(); + } + compact_->num_output_records += sc.num_output_records; - } - for (SubcompactionState& sc : compact_->sub_compact_states) { + compaction_job_stats_->Add(sc.compaction_job_stats); } } CompactionJob::CompactionJob( int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, - const FileOptions& file_options, VersionSet* versions, - const std::atomic* shutting_down, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, - FSDirectory* db_directory, FSDirectory* output_directory, Statistics* stats, + FSDirectory* db_directory, FSDirectory* output_directory, + FSDirectory* blob_output_directory, Statistics* stats, InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, @@ -294,16 +314,24 @@ CompactionJob::CompactionJob( EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, CompactionJobStats* compaction_job_stats, Env::Priority thread_pri, const std::shared_ptr& io_tracer, - const std::atomic* manual_compaction_paused, const std::string& db_id, - const std::string& db_session_id) - : job_id_(job_id), - compact_(new CompactionState(compaction)), - compaction_job_stats_(compaction_job_stats), + const std::atomic* manual_compaction_paused, + const std::atomic* manual_compaction_canceled, + const std::string& db_id, const std::string& db_session_id, + std::string full_history_ts_low, BlobFileCompletionCallback* blob_callback) + : compact_(new CompactionState(compaction)), compaction_stats_(compaction->compaction_reason(), 1), + db_options_(db_options), + mutable_db_options_copy_(mutable_db_options), + log_buffer_(log_buffer), + output_directory_(output_directory), + stats_(stats), + bottommost_level_(false), + write_hint_(Env::WLTH_NOT_SET), + job_id_(job_id), + compaction_job_stats_(compaction_job_stats), dbname_(dbname), db_id_(db_id), db_session_id_(db_session_id), - db_options_(db_options), file_options_(file_options), env_(db_options.env), io_tracer_(io_tracer), @@ -313,11 +341,10 @@ CompactionJob::CompactionJob( versions_(versions), shutting_down_(shutting_down), manual_compaction_paused_(manual_compaction_paused), + manual_compaction_canceled_(manual_compaction_canceled), preserve_deletes_seqnum_(preserve_deletes_seqnum), - log_buffer_(log_buffer), db_directory_(db_directory), - output_directory_(output_directory), - stats_(stats), + blob_output_directory_(blob_output_directory), db_mutex_(db_mutex), db_error_handler_(db_error_handler), existing_snapshots_(std::move(existing_snapshots)), @@ -325,11 +352,11 @@ CompactionJob::CompactionJob( snapshot_checker_(snapshot_checker), table_cache_(std::move(table_cache)), event_logger_(event_logger), - bottommost_level_(false), paranoid_file_checks_(paranoid_file_checks), measure_io_stats_(measure_io_stats), - write_hint_(Env::WLTH_NOT_SET), - thread_pri_(thread_pri) { + thread_pri_(thread_pri), + full_history_ts_low_(std::move(full_history_ts_low)), + blob_callback_(blob_callback) { assert(compaction_job_stats_ != nullptr); assert(log_buffer_ 
!= nullptr); const auto* cfd = compact_->compaction->column_family_data(); @@ -391,7 +418,7 @@ void CompactionJob::Prepare() { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PREPARE); - // Generate file_levels_ for compaction berfore making Iterator + // Generate file_levels_ for compaction before making Iterator auto* c = compact_->compaction; assert(c->column_family_data() != nullptr); assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( @@ -403,7 +430,7 @@ void CompactionJob::Prepare() { if (c->ShouldFormSubcompactions()) { { - StopWatch sw(env_, stats_, SUBCOMPACTION_SETUP_TIME); + StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME); GenSubcompactionBoundaries(); } assert(sizes_.size() == boundaries_.size() + 1); @@ -526,9 +553,10 @@ void CompactionJob::GenSubcompactionBoundaries() { int base_level = v->storage_info()->base_level(); uint64_t max_output_files = static_cast(std::ceil( sum / min_file_fill_percent / - MaxFileSizeForLevel(*(c->mutable_cf_options()), out_lvl, - c->immutable_cf_options()->compaction_style, base_level, - c->immutable_cf_options()->level_compaction_dynamic_level_bytes))); + MaxFileSizeForLevel( + *(c->mutable_cf_options()), out_lvl, + c->immutable_options()->compaction_style, base_level, + c->immutable_options()->level_compaction_dynamic_level_bytes))); uint64_t subcompactions = std::min({static_cast(ranges.size()), static_cast(c->max_subcompactions()), @@ -569,7 +597,7 @@ Status CompactionJob::Run() { const size_t num_threads = compact_->sub_compact_states.size(); assert(num_threads > 0); - const uint64_t start_micros = env_->NowMicros(); + const uint64_t start_micros = db_options_.clock->NowMicros(); // Launch a thread for each of subcompactions 1...num_threads-1 std::vector thread_pool; @@ -588,7 +616,7 @@ Status CompactionJob::Run() { thread.join(); } - compaction_stats_.micros = env_->NowMicros() - start_micros; + compaction_stats_.micros = db_options_.clock->NowMicros() - start_micros; compaction_stats_.cpu_micros = 0; for (size_t i = 0; i < compact_->sub_compact_states.size(); i++) { compaction_stats_.cpu_micros += @@ -604,18 +632,34 @@ Status CompactionJob::Run() { // Check if any thread encountered an error during execution Status status; IOStatus io_s; + bool wrote_new_blob_files = false; + for (const auto& state : compact_->sub_compact_states) { if (!state.status.ok()) { status = state.status; io_s = state.io_status; break; } + + if (!state.blob_file_additions.empty()) { + wrote_new_blob_files = true; + } } + if (io_status_.ok()) { io_status_ = io_s; } - if (status.ok() && output_directory_) { - io_s = output_directory_->Fsync(IOOptions(), nullptr); + if (status.ok()) { + constexpr IODebugContext* dbg = nullptr; + + if (output_directory_) { + io_s = output_directory_->Fsync(IOOptions(), dbg); + } + + if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ && + blob_output_directory_ != output_directory_) { + io_s = blob_output_directory_->Fsync(IOOptions(), dbg); + } } if (io_status_.ok()) { io_status_ = io_s; @@ -711,7 +755,7 @@ Status CompactionJob::Run() { for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.outputs) { auto fn = - TableFileName(state.compaction->immutable_cf_options()->cf_paths, + TableFileName(state.compaction->immutable_options()->cf_paths, output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); tp[fn] = output.table_properties; } @@ -721,6 +765,7 @@ Status CompactionJob::Run() { // Finish up all book-keeping to unify 
the subcompaction results AggregateStatistics(); UpdateCompactionStats(); + RecordCompactionIOStats(); LogFlush(db_options_.info_log); TEST_SYNC_POINT("CompactionJob::Run():End"); @@ -730,11 +775,16 @@ Status CompactionJob::Run() { } Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { + assert(compact_); + AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_INSTALL); db_mutex_->AssertHeld(); Status status = compact_->status; + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + assert(cfd); + cfd->internal_stats()->AddCompactionStats( compact_->compaction->output_level(), thread_pri_, compaction_stats_); @@ -744,6 +794,7 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { if (!versions_->io_status().ok()) { io_status_ = versions_->io_status(); } + VersionStorageInfo::LevelSummaryStorage tmp; auto vstorage = cfd->current()->storage_info(); const auto& stats = compaction_stats_; @@ -753,53 +804,78 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { double bytes_read_per_sec = 0; double bytes_written_per_sec = 0; - if (stats.bytes_read_non_output_levels > 0) { - read_write_amp = (stats.bytes_written + stats.bytes_read_output_level + - stats.bytes_read_non_output_levels) / - static_cast(stats.bytes_read_non_output_levels); - write_amp = stats.bytes_written / - static_cast(stats.bytes_read_non_output_levels); + const uint64_t bytes_read_non_output_and_blob = + stats.bytes_read_non_output_levels + stats.bytes_read_blob; + const uint64_t bytes_read_all = + stats.bytes_read_output_level + bytes_read_non_output_and_blob; + const uint64_t bytes_written_all = + stats.bytes_written + stats.bytes_written_blob; + + if (bytes_read_non_output_and_blob > 0) { + read_write_amp = (bytes_written_all + bytes_read_all) / + static_cast(bytes_read_non_output_and_blob); + write_amp = + bytes_written_all / static_cast(bytes_read_non_output_and_blob); } if (stats.micros > 0) { - bytes_read_per_sec = - (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) / - static_cast(stats.micros); + bytes_read_per_sec = bytes_read_all / static_cast(stats.micros); bytes_written_per_sec = - stats.bytes_written / static_cast(stats.micros); + bytes_written_all / static_cast(stats.micros); } + const std::string& column_family_name = cfd->GetName(); + + constexpr double kMB = 1048576.0; + ROCKS_LOG_BUFFER( log_buffer_, "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " - "files in(%d, %d) out(%d) " - "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " - "write-amplify(%.1f) %s, records in: %" PRIu64 + "files in(%d, %d) out(%d +%d blob) " + "MB in(%.1f, %.1f +%.1f blob) out(%.1f +%.1f blob), " + "read-write-amplify(%.1f) write-amplify(%.1f) %s, records in: %" PRIu64 ", records dropped: %" PRIu64 " output_compression: %s\n", - cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), bytes_read_per_sec, - bytes_written_per_sec, compact_->compaction->output_level(), + column_family_name.c_str(), vstorage->LevelSummary(&tmp), + bytes_read_per_sec, bytes_written_per_sec, + compact_->compaction->output_level(), stats.num_input_files_in_non_output_levels, stats.num_input_files_in_output_level, stats.num_output_files, - stats.bytes_read_non_output_levels / 1048576.0, - stats.bytes_read_output_level / 1048576.0, - stats.bytes_written / 1048576.0, read_write_amp, write_amp, - status.ToString().c_str(), stats.num_input_records, + stats.num_output_files_blob, stats.bytes_read_non_output_levels / kMB, 
+ stats.bytes_read_output_level / kMB, stats.bytes_read_blob / kMB, + stats.bytes_written / kMB, stats.bytes_written_blob / kMB, read_write_amp, + write_amp, status.ToString().c_str(), stats.num_input_records, stats.num_dropped_records, CompressionTypeToString(compact_->compaction->output_compression()) .c_str()); + const auto& blob_files = vstorage->GetBlobFiles(); + if (!blob_files.empty()) { + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 + "\n", + column_family_name.c_str(), blob_files.begin()->first, + blob_files.rbegin()->first); + } + UpdateCompactionJobStats(stats); auto stream = event_logger_->LogToBuffer(log_buffer_); stream << "job" << job_id_ << "event" << "compaction_finished" - << "compaction_time_micros" << compaction_stats_.micros - << "compaction_time_cpu_micros" << compaction_stats_.cpu_micros - << "output_level" << compact_->compaction->output_level() - << "num_output_files" << compact_->NumOutputFiles() - << "total_output_size" << compact_->total_bytes << "num_input_records" - << compaction_stats_.num_input_records << "num_output_records" - << compact_->num_output_records << "num_subcompactions" - << compact_->sub_compact_states.size() << "output_compression" + << "compaction_time_micros" << stats.micros + << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level" + << compact_->compaction->output_level() << "num_output_files" + << compact_->num_output_files << "total_output_size" + << compact_->total_bytes; + + if (compact_->num_blob_output_files > 0) { + stream << "num_blob_output_files" << compact_->num_blob_output_files + << "total_blob_output_size" << compact_->total_blob_bytes; + } + + stream << "num_input_records" << stats.num_input_records + << "num_output_records" << compact_->num_output_records + << "num_subcompactions" << compact_->sub_compact_states.size() + << "output_compression" << CompressionTypeToString(compact_->compaction->output_compression()); stream << "num_single_delete_mismatches" @@ -823,14 +899,173 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { } stream.EndArray(); + if (!blob_files.empty()) { + stream << "blob_file_head" << blob_files.begin()->first; + stream << "blob_file_tail" << blob_files.rbegin()->first; + } + CleanupCompaction(); return status; } +#ifndef ROCKSDB_LITE +void CompactionJob::ProcessKeyValueCompactionWithCompactionService( + SubcompactionState* sub_compact) { + assert(sub_compact); + assert(sub_compact->compaction); + assert(db_options_.compaction_service); + + const Compaction* compaction = sub_compact->compaction; + CompactionServiceInput compaction_input; + compaction_input.output_level = compaction->output_level(); + + const std::vector& inputs = + *(compact_->compaction->inputs()); + for (const auto& files_per_level : inputs) { + for (const auto& file : files_per_level.files) { + compaction_input.input_files.emplace_back( + MakeTableFileName(file->fd.GetNumber())); + } + } + compaction_input.column_family.name = + compaction->column_family_data()->GetName(); + compaction_input.column_family.options = + compaction->column_family_data()->GetLatestCFOptions(); + compaction_input.db_options = + BuildDBOptions(db_options_, mutable_db_options_copy_); + compaction_input.snapshots = existing_snapshots_; + compaction_input.has_begin = sub_compact->start; + compaction_input.begin = + compaction_input.has_begin ? 
sub_compact->start->ToString() : ""; + compaction_input.has_end = sub_compact->end; + compaction_input.end = + compaction_input.has_end ? sub_compact->end->ToString() : ""; + compaction_input.approx_size = sub_compact->approx_size; + + std::string compaction_input_binary; + Status s = compaction_input.Write(&compaction_input_binary); + if (!s.ok()) { + sub_compact->status = s; + return; + } + + std::ostringstream input_files_oss; + bool is_first_one = true; + for (const auto& file : compaction_input.input_files) { + input_files_oss << (is_first_one ? "" : ", ") << file; + is_first_one = false; + } + + ROCKS_LOG_INFO( + db_options_.info_log, + "[%s] [JOB %d] Starting remote compaction (output level: %d): %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_input.output_level, input_files_oss.str().c_str()); + CompactionServiceJobStatus compaction_status = + db_options_.compaction_service->Start(compaction_input_binary, job_id_); + if (compaction_status != CompactionServiceJobStatus::kSuccess) { + sub_compact->status = + Status::Incomplete("CompactionService failed to start compaction job."); + return; + } + + std::string compaction_result_binary; + compaction_status = db_options_.compaction_service->WaitForComplete( + job_id_, &compaction_result_binary); + + CompactionServiceResult compaction_result; + s = CompactionServiceResult::Read(compaction_result_binary, + &compaction_result); + if (compaction_status != CompactionServiceJobStatus::kSuccess) { + sub_compact->status = + s.ok() ? compaction_result.status + : Status::Incomplete( + "CompactionService failed to run compaction job."); + compaction_result.status.PermitUncheckedError(); + ROCKS_LOG_WARN(db_options_.info_log, + "[%s] [JOB %d] Remote compaction failed, status: %s", + compaction_input.column_family.name.c_str(), job_id_, + s.ToString().c_str()); + return; + } + + if (!s.ok()) { + sub_compact->status = s; + compaction_result.status.PermitUncheckedError(); + return; + } + sub_compact->status = compaction_result.status; + + std::ostringstream output_files_oss; + is_first_one = true; + for (const auto& file : compaction_result.output_files) { + output_files_oss << (is_first_one ? 
"" : ", ") << file.file_name; + is_first_one = false; + } + + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Receive remote compaction result, output path: " + "%s, files: %s", + compaction_input.column_family.name.c_str(), job_id_, + compaction_result.output_path.c_str(), + output_files_oss.str().c_str()); + + if (!s.ok()) { + sub_compact->status = s; + return; + } + + for (const auto& file : compaction_result.output_files) { + uint64_t file_num = versions_->NewFileNumber(); + auto src_file = compaction_result.output_path + "/" + file.file_name; + auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths, + file_num, compaction->output_path_id()); + s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr); + if (!s.ok()) { + sub_compact->status = s; + return; + } + + FileMetaData meta; + uint64_t file_size; + s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr); + if (!s.ok()) { + sub_compact->status = s; + return; + } + meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size, + file.smallest_seqno, file.largest_seqno); + meta.smallest.DecodeFrom(file.smallest_internal_key); + meta.largest.DecodeFrom(file.largest_internal_key); + meta.oldest_ancester_time = file.oldest_ancester_time; + meta.file_creation_time = file.file_creation_time; + meta.marked_for_compaction = file.marked_for_compaction; + + auto cfd = compaction->column_family_data(); + sub_compact->outputs.emplace_back(std::move(meta), + cfd->internal_comparator(), false, false, + true, file.paranoid_hash); + } + sub_compact->compaction_job_stats = compaction_result.stats; + sub_compact->num_output_records = compaction_result.num_output_records; + sub_compact->approx_size = compaction_input.approx_size; // is this used? + sub_compact->total_bytes = compaction_result.total_bytes; + IOSTATS_ADD(bytes_written, compaction_result.bytes_written); + IOSTATS_ADD(bytes_read, compaction_result.bytes_read); +} +#endif // !ROCKSDB_LITE + void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { - assert(sub_compact != nullptr); + assert(sub_compact); + assert(sub_compact->compaction); - uint64_t prev_cpu_micros = env_->NowCPUNanos() / 1000; +#ifndef ROCKSDB_LITE + if (db_options_.compaction_service) { + return ProcessKeyValueCompactionWithCompactionService(sub_compact); + } +#endif // !ROCKSDB_LITE + + uint64_t prev_cpu_micros = db_options_.clock->CPUNanos() / 1000; ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); @@ -853,6 +1088,10 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(), existing_snapshots_); + + const Slice* const start = sub_compact->start; + const Slice* const end = sub_compact->end; + ReadOptions read_options; read_options.verify_checksums = true; read_options.fill_cache = false; @@ -862,11 +1101,42 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // (b) CompactionFilter::Decision::kRemoveAndSkipUntil. read_options.total_order_seek = true; + // Note: if we're going to support subcompactions for user-defined timestamps, + // the timestamp part will have to be stripped from the bounds here. 
+ assert((!start && !end) || cfd->user_comparator()->timestamp_size() == 0); + read_options.iterate_lower_bound = start; + read_options.iterate_upper_bound = end; + // Although the v2 aggregator is what the level iterator(s) know about, // the AddTombstones calls will be propagated down to the v1 aggregator. - std::unique_ptr input( + std::unique_ptr raw_input( versions_->MakeInputIterator(read_options, sub_compact->compaction, &range_del_agg, file_options_for_read_)); + InternalIterator* input = raw_input.get(); + + IterKey start_ikey; + IterKey end_ikey; + Slice start_slice; + Slice end_slice; + + if (start) { + start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek); + start_slice = start_ikey.GetInternalKey(); + } + if (end) { + end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek); + end_slice = end_ikey.GetInternalKey(); + } + + std::unique_ptr clip; + if (start || end) { + clip.reset(new ClippingIterator( + raw_input.get(), start ? &start_slice : nullptr, + end ? &end_slice : nullptr, &cfd->internal_comparator())); + input = clip.get(); + } + + input->SeekToFirst(); AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PROCESS_KV); @@ -892,12 +1162,29 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } MergeHelper merge( - env_, cfd->user_comparator(), cfd->ioptions()->merge_operator, + env_, cfd->user_comparator(), cfd->ioptions()->merge_operator.get(), compaction_filter, db_options_.info_log.get(), false /* internal key corruption is expected */, existing_snapshots_.empty() ? 0 : existing_snapshots_.back(), snapshot_checker_, compact_->compaction->level(), - db_options_.statistics.get(), shutting_down_); + db_options_.stats, shutting_down_); + + const MutableCFOptions* mutable_cf_options = + sub_compact->compaction->mutable_cf_options(); + assert(mutable_cf_options); + + std::vector blob_file_paths; + + std::unique_ptr blob_file_builder( + mutable_cf_options->enable_blob_files + ? new BlobFileBuilder(versions_, fs_.get(), + sub_compact->compaction->immutable_options(), + mutable_cf_options, &file_options_, job_id_, + cfd->GetID(), cfd->GetName(), + Env::IOPriority::IO_LOW, write_hint_, + io_tracer_, blob_callback_, &blob_file_paths, + &sub_compact->blob_file_additions) + : nullptr); TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); TEST_SYNC_POINT_CALLBACK( @@ -905,26 +1192,18 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { reinterpret_cast( const_cast*>(manual_compaction_paused_))); - Slice* start = sub_compact->start; - Slice* end = sub_compact->end; - if (start != nullptr) { - IterKey start_iter; - start_iter.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek); - input->Seek(start_iter.GetInternalKey()); - } else { - input->SeekToFirst(); - } - Status status; + const std::string* const full_history_ts_low = + full_history_ts_low_.empty() ? 
nullptr : &full_history_ts_low_; sub_compact->c_iter.reset(new CompactionIterator( - input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(), + input, cfd->user_comparator(), &merge, versions_->LastSequence(), &existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), /*expect_valid_internal_key=*/true, &range_del_agg, - /* blob_file_builder */ nullptr, db_options_.allow_data_in_errors, + blob_file_builder.get(), db_options_.allow_data_in_errors, sub_compact->compaction, compaction_filter, shutting_down_, preserve_deletes_seqnum_, manual_compaction_paused_, - db_options_.info_log)); + manual_compaction_canceled_, db_options_.info_log, full_history_ts_low)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { @@ -948,12 +1227,9 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { const Slice& key = c_iter->key(); const Slice& value = c_iter->value(); - // If an end key (exclusive) is specified, check if the current key is - // >= than it and exit if it is because the iterator is out of its range - if (end != nullptr && - cfd->user_comparator()->Compare(c_iter->user_key(), *end) >= 0) { - break; - } + assert(!end || + cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0); + if (c_iter_stats.num_input_records % kRecordStatsEvery == kRecordStatsEvery - 1) { RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); @@ -1037,6 +1313,10 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } } + sub_compact->compaction_job_stats.num_blobs_read = + c_iter_stats.num_blobs_read; + sub_compact->compaction_job_stats.total_blob_bytes_read = + c_iter_stats.total_blob_bytes_read; sub_compact->compaction_job_stats.num_input_deletion_records = c_iter_stats.num_input_deletion_records; sub_compact->compaction_job_stats.num_corrupt_keys = @@ -1064,8 +1344,10 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { status = Status::ShutdownInProgress("Database shutdown"); } if ((status.ok() || status.IsColumnFamilyDropped()) && - (manual_compaction_paused_ && - manual_compaction_paused_->load(std::memory_order_relaxed) > 0)) { + ((manual_compaction_paused_ && + manual_compaction_paused_->load(std::memory_order_relaxed) > 0) || + (manual_compaction_canceled_ && + manual_compaction_canceled_->load(std::memory_order_relaxed)))) { status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } if (status.ok()) { @@ -1093,8 +1375,17 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); } + if (blob_file_builder) { + if (status.ok()) { + status = blob_file_builder->Finish(); + } else { + blob_file_builder->Abandon(); + } + blob_file_builder.reset(); + } + sub_compact->compaction_job_stats.cpu_micros = - env_->NowCPUNanos() / 1000 - prev_cpu_micros; + db_options_.clock->CPUNanos() / 1000 - prev_cpu_micros; if (measure_io_stats_) { sub_compact->compaction_job_stats.file_write_nanos += @@ -1125,7 +1416,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { #endif // ROCKSDB_ASSERT_STATUS_CHECKED sub_compact->c_iter.reset(); - input.reset(); + clip.reset(); + raw_input.reset(); sub_compact->status = status; } @@ -1373,7 +1665,7 @@ Status CompactionJob::FinishCompactionOutputFile( // Finish and check for file 
errors if (s.ok()) { - StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); + StopWatch sw(db_options_.clock, stats_, COMPACTION_OUTFILE_SYNC_MICROS); io_s = sub_compact->outfile->Sync(db_options_.use_fsync); } if (s.ok() && io_s.ok()) { @@ -1408,9 +1700,20 @@ Status CompactionJob::FinishCompactionOutputFile( // This happens when the output level is bottom level, at the same time // the sub_compact output nothing. std::string fname = - TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, + TableFileName(sub_compact->compaction->immutable_options()->cf_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); - env_->DeleteFile(fname); + + // TODO(AR) it is not clear if there are any larger implications if + // DeleteFile fails here + Status ds = env_->DeleteFile(fname); + if (!ds.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "[%s] [JOB %d] Unable to remove SST file for table #%" PRIu64 + " at bottom level%s", + cfd->GetName().c_str(), job_id_, output_number, + meta->marked_for_compaction ? " (need compaction)" : ""); + } // Also need to remove the file from outputs, or it will be added to the // VersionEdit. @@ -1434,9 +1737,7 @@ Status CompactionJob::FinishCompactionOutputFile( FileDescriptor output_fd; uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; if (meta != nullptr) { - fname = - TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, - meta->fd.GetNumber(), meta->fd.GetPathId()); + fname = GetTableFileName(meta->fd.GetNumber()); output_fd = meta->fd; oldest_blob_file_number = meta->oldest_blob_file_number; } else { @@ -1465,9 +1766,7 @@ Status CompactionJob::FinishCompactionOutputFile( "CompactionJob::FinishCompactionOutputFile:" "MaxAllowedSpaceReached"); InstrumentedMutexLock l(db_mutex_); - // Should handle return error? - db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction) - .PermitUncheckedError(); + db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction); } } #endif @@ -1479,9 +1778,13 @@ Status CompactionJob::FinishCompactionOutputFile( Status CompactionJob::InstallCompactionResults( const MutableCFOptions& mutable_cf_options) { + assert(compact_); + db_mutex_->AssertHeld(); auto* compaction = compact_->compaction; + assert(compaction); + // paranoia: verify that the files that we started with // still exist in the current version and in the same original level. 
// This ensures that a concurrent compaction did not erroneously @@ -1497,23 +1800,32 @@ Status CompactionJob::InstallCompactionResults( { Compaction::InputLevelSummaryBuffer inputs_summary; - ROCKS_LOG_INFO( - db_options_.info_log, "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", - compaction->column_family_data()->GetName().c_str(), job_id_, - compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes); + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes", + compaction->column_family_data()->GetName().c_str(), job_id_, + compaction->InputLevelSummary(&inputs_summary), + compact_->total_bytes + compact_->total_blob_bytes); } + VersionEdit* const edit = compaction->edit(); + assert(edit); + // Add compaction inputs - compaction->AddInputDeletions(compact_->compaction->edit()); + compaction->AddInputDeletions(edit); for (const auto& sub_compact : compact_->sub_compact_states) { for (const auto& out : sub_compact.outputs) { - compaction->edit()->AddFile(compaction->output_level(), out.meta); + edit->AddFile(compaction->output_level(), out.meta); + } + + for (const auto& blob : sub_compact.blob_file_additions) { + edit->AddBlobFile(blob); } } + return versions_->LogAndApply(compaction->column_family_data(), - mutable_cf_options, compaction->edit(), - db_mutex_, db_directory_); + mutable_cf_options, edit, db_mutex_, + db_directory_); } void CompactionJob::RecordCompactionIOStats() { @@ -1545,9 +1857,7 @@ Status CompactionJob::OpenCompactionOutputFile( assert(sub_compact->builder == nullptr); // no need to lock because VersionSet::next_file_number_ is atomic uint64_t file_number = versions_->NewFileNumber(); - std::string fname = - TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, - file_number, sub_compact->compaction->output_path_id()); + std::string fname = GetTableFileName(file_number); // Fire events. ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); #ifndef ROCKSDB_LITE @@ -1562,9 +1872,17 @@ Status CompactionJob::OpenCompactionOutputFile( TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile", &syncpoint_arg); #endif + + // Pass temperature of bottommost files to FileSystem. + FileOptions fo_copy = file_options_; + Temperature temperature = Temperature::kUnknown; + if (bottommost_level_) { + fo_copy.temperature = temperature = + sub_compact->compaction->mutable_cf_options()->bottommost_temperature; + } + Status s; - IOStatus io_s = - NewWritableFile(fs_.get(), fname, &writable_file, file_options_); + IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy); s = io_s; if (sub_compact->io_status.ok()) { sub_compact->io_status = io_s; @@ -1590,7 +1908,7 @@ Status CompactionJob::OpenCompactionOutputFile( // Try to figure out the output file's oldest ancester time. int64_t temp_current_time = 0; - auto get_time_status = env_->GetCurrentTime(&temp_current_time); + auto get_time_status = db_options_.clock->GetCurrentTime(&temp_current_time); // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
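As an aside to the temperature handling a few hunks above: when the output sits on the bottommost level, the FileOptions handed to NewWritableFile now carry the column family's bottommost_temperature, so a FileSystem implementation can act on it. A rough sketch of such a consumer, not part of this patch; the constructor is inherited because FileSystemWrapper's constructor signature has varied across releases, and the cold-path mapping is purely hypothetical.

#include <memory>
#include <string>

#include "rocksdb/file_system.h"

class TemperatureAwareFileSystem : public ROCKSDB_NAMESPACE::FileSystemWrapper {
 public:
  using FileSystemWrapper::FileSystemWrapper;

  ROCKSDB_NAMESPACE::IOStatus NewWritableFile(
      const std::string& fname, const ROCKSDB_NAMESPACE::FileOptions& opts,
      std::unique_ptr<ROCKSDB_NAMESPACE::FSWritableFile>* result,
      ROCKSDB_NAMESPACE::IODebugContext* dbg) override {
    if (opts.temperature != ROCKSDB_NAMESPACE::Temperature::kUnknown) {
      // Hypothetical policy: non-default-temperature (bottommost) SST files
      // go to a slower, cheaper device; MapToColdPath is a made-up helper.
      return FileSystemWrapper::NewWritableFile(MapToColdPath(fname), opts,
                                                result, dbg);
    }
    return FileSystemWrapper::NewWritableFile(fname, opts, result, dbg);
  }

 private:
  static std::string MapToColdPath(const std::string& fname) {
    return "/mnt/cold" + fname;  // hypothetical mount point
  }
};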
if (!get_time_status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, @@ -1611,6 +1929,7 @@ Status CompactionJob::OpenCompactionOutputFile( sub_compact->compaction->output_path_id(), 0); meta.oldest_ancester_time = oldest_ancester_time; meta.file_creation_time = current_time; + meta.temperature = temperature; sub_compact->outputs.emplace_back( std::move(meta), cfd->internal_comparator(), /*enable_order_check=*/ @@ -1621,32 +1940,29 @@ Status CompactionJob::OpenCompactionOutputFile( writable_file->SetIOPriority(Env::IOPriority::IO_LOW); writable_file->SetWriteLifeTimeHint(write_hint_); + FileTypeSet tmp_set = db_options_.checksum_handoff_file_types; writable_file->SetPreallocationBlockSize(static_cast( sub_compact->compaction->OutputFilePreallocationSize())); const auto& listeners = - sub_compact->compaction->immutable_cf_options()->listeners; + sub_compact->compaction->immutable_options()->listeners; sub_compact->outfile.reset(new WritableFileWriter( - std::move(writable_file), fname, file_options_, env_, io_tracer_, - db_options_.statistics.get(), listeners, - db_options_.file_checksum_gen_factory.get())); - - // If the Column family flag is to only optimize filters for hits, - // we can skip creating filters if this is the bottommost_level where - // data is going to be found - bool skip_filters = - cfd->ioptions()->optimize_filters_for_hits && bottommost_level_; + std::move(writable_file), fname, file_options_, db_options_.clock, + io_tracer_, db_options_.stats, listeners, + db_options_.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile))); - sub_compact->builder.reset(NewTableBuilder( + TableBuilderOptions tboptions( *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), - cfd->GetID(), cfd->GetName(), sub_compact->outfile.get(), sub_compact->compaction->output_compression(), - 0 /*sample_for_compression */, - sub_compact->compaction->output_compression_opts(), - sub_compact->compaction->output_level(), skip_filters, - oldest_ancester_time, 0 /* oldest_key_time */, - sub_compact->compaction->max_output_file_size(), current_time, db_id_, - db_session_id_)); + sub_compact->compaction->output_compression_opts(), cfd->GetID(), + cfd->GetName(), sub_compact->compaction->output_level(), + bottommost_level_, TableFileCreationReason::kCompaction, + oldest_ancester_time, 0 /* oldest_key_time */, current_time, db_id_, + db_session_id_, sub_compact->compaction->max_output_file_size(), + file_number); + sub_compact->builder.reset( + NewTableBuilder(tboptions, sub_compact->outfile.get())); LogFlush(db_options_.info_log); return s; } @@ -1689,6 +2005,8 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) { #endif // !ROCKSDB_LITE void CompactionJob::UpdateCompactionStats() { + assert(compact_); + Compaction* compaction = compact_->compaction; compaction_stats_.num_input_files_in_non_output_levels = 0; compaction_stats_.num_input_files_in_output_level = 0; @@ -1706,27 +2024,20 @@ void CompactionJob::UpdateCompactionStats() { } } - uint64_t num_output_records = 0; - - for (const auto& sub_compact : compact_->sub_compact_states) { - size_t num_output_files = sub_compact.outputs.size(); - if (sub_compact.builder != nullptr) { - // An error occurred so ignore the last output. 
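Related usage note: the WritableFileWriter constructed a few lines above is now told whether kTableFile is in db_options_.checksum_handoff_file_types, which enables checksum handoff for compaction outputs. A small configuration sketch, not part of this patch; the Add() call is assumed to be the set's insertion method, so verify it against the FileTypeSet definition.

#include "rocksdb/options.h"
#include "rocksdb/types.h"

ROCKSDB_NAMESPACE::Options MakeChecksumHandoffOptions() {
  ROCKSDB_NAMESPACE::Options options;
  // Ask RocksDB to compute a checksum for every write to table files and hand
  // it to the FileSystem, which can verify the buffer before persisting it.
  options.checksum_handoff_file_types.Add(
      ROCKSDB_NAMESPACE::FileType::kTableFile);
  return options;
}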
- assert(num_output_files > 0); - --num_output_files; - } - compaction_stats_.num_output_files += static_cast(num_output_files); + assert(compaction_job_stats_); + compaction_stats_.bytes_read_blob = + compaction_job_stats_->total_blob_bytes_read; - num_output_records += sub_compact.num_output_records; + compaction_stats_.num_output_files = + static_cast(compact_->num_output_files); + compaction_stats_.num_output_files_blob = + static_cast(compact_->num_blob_output_files); + compaction_stats_.bytes_written = compact_->total_bytes; + compaction_stats_.bytes_written_blob = compact_->total_blob_bytes; - for (const auto& out : sub_compact.outputs) { - compaction_stats_.bytes_written += out.meta.fd.file_size; - } - } - - if (compaction_stats_.num_input_records > num_output_records) { + if (compaction_stats_.num_input_records > compact_->num_output_records) { compaction_stats_.num_dropped_records = - compaction_stats_.num_input_records - num_output_records; + compaction_stats_.num_input_records - compact_->num_output_records; } } @@ -1762,10 +2073,12 @@ void CompactionJob::UpdateCompactionJobStats( // output information compaction_job_stats_->total_output_bytes = stats.bytes_written; + compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob; compaction_job_stats_->num_output_records = compact_->num_output_records; compaction_job_stats_->num_output_files = stats.num_output_files; + compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob; - if (compact_->NumOutputFiles() > 0U) { + if (stats.num_output_files > 0) { CopyPrefix(compact_->SmallestUserKey(), CompactionJobStats::kMaxPrefixLength, &compaction_job_stats_->smallest_output_key_prefix); @@ -1852,7 +2165,7 @@ void CompactionJob::RunRemote(PluggableCompactionService* service) { uint64_t fileno = f[i]->fd.GetNumber(); uint32_t pathid = f[i]->fd.GetPathId(); files_in_one_level.files.push_back( - TableFileName(c->immutable_cf_options()->cf_paths, fileno, pathid)); + TableFileName(c->immutable_options()->cf_paths, fileno, pathid)); } param.input_files.push_back(files_in_one_level); } @@ -1868,6 +2181,7 @@ void CompactionJob::RunRemote(PluggableCompactionService* service) { compact_->sub_compact_states[i].compaction_job_stats.cpu_micros; } + RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros); RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, compaction_stats_.cpu_micros); @@ -1895,7 +2209,7 @@ void CompactionJob::RunRemote(PluggableCompactionService* service) { // Generate a path name where an externally compacted file can // be copied into. Do not read into block cache. 
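Observability note: with the RecordTimeToHistogram() call added above, the RunRemote() path now populates COMPACTION_TIME alongside COMPACTION_CPU_TIME, just like the local path. A minimal sketch of reading those histograms back, not part of this patch; histogramData() is the existing Statistics API, and the recorded values are microseconds, matching compaction_stats_.micros.

#include <cstdio>
#include <memory>

#include "rocksdb/statistics.h"

void DumpCompactionTimings(
    const std::shared_ptr<ROCKSDB_NAMESPACE::Statistics>& stats) {
  ROCKSDB_NAMESPACE::HistogramData wall;
  ROCKSDB_NAMESPACE::HistogramData cpu;
  stats->histogramData(ROCKSDB_NAMESPACE::COMPACTION_TIME, &wall);
  stats->histogramData(ROCKSDB_NAMESPACE::COMPACTION_CPU_TIME, &cpu);
  std::fprintf(stderr, "compaction wall p50=%.0fus p95=%.0fus cpu p50=%.0fus\n",
               wall.median, wall.percentile95, cpu.median);
}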
destinations.push_back( - TableFileName(sub->compaction->immutable_cf_options()->cf_paths, + TableFileName(sub->compaction->immutable_options()->cf_paths, file_numbers.back(), sub->compaction->output_path_id())); ROCKS_LOG_INFO(db_options_.info_log, "Going to install file %s to %s", @@ -1948,7 +2262,7 @@ void CompactionJob::RunRemote(PluggableCompactionService* service) { for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.outputs) { auto fn = - TableFileName(state.compaction->immutable_cf_options()->cf_paths, + TableFileName(state.compaction->immutable_options()->cf_paths, output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); tp[fn] = output.table_properties; } @@ -1978,7 +2292,7 @@ void CompactionJob::RetrieveResultsAndCleanup( for (const auto& sub_compact : compact_->sub_compact_states) { for (const auto& out : sub_compact.outputs) { std::string path = TableFileName( - sub_compact.compaction->immutable_cf_options()->cf_paths, + sub_compact.compaction->immutable_options()->cf_paths, out.meta.fd.GetNumber(), out.meta.fd.GetPathId()); OutputFile file; @@ -2002,4 +2316,625 @@ void CompactionJob::RetrieveResultsAndCleanup( CleanupCompaction(); } +std::string CompactionJob::GetTableFileName(uint64_t file_number) { + return TableFileName(compact_->compaction->immutable_options()->cf_paths, + file_number, compact_->compaction->output_path_id()); +} + +#ifndef ROCKSDB_LITE +std::string CompactionServiceCompactionJob::GetTableFileName( + uint64_t file_number) { + return MakeTableFileName(output_path_, file_number); +} + +CompactionServiceCompactionJob::CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, + VersionSet* versions, const std::atomic* shutting_down, + LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + std::shared_ptr table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr& io_tracer, + const std::string& db_id, const std::string& db_session_id, + const std::string& output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result) + : CompactionJob( + job_id, compaction, db_options, mutable_db_options, file_options, + versions, shutting_down, 0, log_buffer, nullptr, output_directory, + nullptr, stats, db_mutex, db_error_handler, existing_snapshots, + kMaxSequenceNumber, nullptr, table_cache, event_logger, + compaction->mutable_cf_options()->paranoid_file_checks, + compaction->mutable_cf_options()->report_bg_io_stats, dbname, + &(compaction_service_result->stats), Env::Priority::USER, io_tracer, + nullptr, nullptr, db_id, db_session_id, + compaction->column_family_data()->GetFullHistoryTsLow()), + output_path_(output_path), + compaction_input_(compaction_service_input), + compaction_result_(compaction_service_result) {} + +Status CompactionServiceCompactionJob::Run() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_RUN); + + auto* c = compact_->compaction; + assert(c->column_family_data() != nullptr); + assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( + compact_->compaction->level()) > 0); + + write_hint_ = + c->column_family_data()->CalculateSSTWriteHint(c->output_level()); + bottommost_level_ = c->bottommost_level(); + + Slice begin = 
compaction_input_.begin; + Slice end = compaction_input_.end; + compact_->sub_compact_states.emplace_back( + c, compaction_input_.has_begin ? &begin : nullptr, + compaction_input_.has_end ? &end : nullptr, + compaction_input_.approx_size); + + log_buffer_->FlushBufferToLog(); + LogCompaction(); + const uint64_t start_micros = db_options_.clock->NowMicros(); + // Pick the only sub-compaction we should have + assert(compact_->sub_compact_states.size() == 1); + SubcompactionState* sub_compact = compact_->sub_compact_states.data(); + + ProcessKeyValueCompaction(sub_compact); + + compaction_stats_.micros = db_options_.clock->NowMicros() - start_micros; + compaction_stats_.cpu_micros = sub_compact->compaction_job_stats.cpu_micros; + + RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros); + RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, + compaction_stats_.cpu_micros); + + Status status = sub_compact->status; + IOStatus io_s = sub_compact->io_status; + + if (io_status_.ok()) { + io_status_ = io_s; + } + + if (status.ok()) { + constexpr IODebugContext* dbg = nullptr; + + if (output_directory_) { + io_s = output_directory_->Fsync(IOOptions(), dbg); + } + } + if (io_status_.ok()) { + io_status_ = io_s; + } + if (status.ok()) { + status = io_s; + } + if (status.ok()) { + // TODO: Add verify_table() and VerifyCompactionFileConsistency() + } + + // Finish up all book-keeping to unify the subcompaction results + AggregateStatistics(); + UpdateCompactionStats(); + + compaction_result_->bytes_written = IOSTATS(bytes_written); + compaction_result_->bytes_read = IOSTATS(bytes_read); + RecordCompactionIOStats(); + + LogFlush(db_options_.info_log); + compact_->status = status; + compact_->status.PermitUncheckedError(); + + // Build compaction result + compaction_result_->output_level = compact_->compaction->output_level(); + compaction_result_->output_path = output_path_; + for (const auto& output_file : sub_compact->outputs) { + auto& meta = output_file.meta; + compaction_result_->output_files.emplace_back( + MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.smallest.Encode().ToString(), + meta.largest.Encode().ToString(), meta.oldest_ancester_time, + meta.file_creation_time, output_file.validator.GetHash(), + meta.marked_for_compaction); + } + compaction_result_->num_output_records = sub_compact->num_output_records; + compaction_result_->total_bytes = sub_compact->total_bytes; + + return status; +} + +void CompactionServiceCompactionJob::CleanupCompaction() { + CompactionJob::CleanupCompaction(); +} + +// Internal binary format for the input and result data +enum BinaryFormatVersion : uint32_t { + kOptionsString = 1, // Use string format similar to Option string format +}; + +// offset_of is used to get the offset of a class data member +// ex: offset_of(&ColumnFamilyDescriptor::options) +// This call will return the offset of options in ColumnFamilyDescriptor class +// +// This is the same as offsetof() but allow us to work with non standard-layout +// classes and structures +// refs: +// http://en.cppreference.com/w/cpp/concept/StandardLayoutType +// https://gist.github.com/graphitemaster/494f21190bb2c63c5516 +static ColumnFamilyDescriptor dummy_cfd("", ColumnFamilyOptions()); +template +int offset_of(T1 ColumnFamilyDescriptor::*member) { + return int(size_t(&(dummy_cfd.*member)) - size_t(&dummy_cfd)); +} + +static CompactionServiceInput dummy_cs_input; +template +int offset_of(T1 CompactionServiceInput::*member) { + return 
int(size_t(&(dummy_cs_input.*member)) - size_t(&dummy_cs_input)); +} + +static std::unordered_map cfd_type_info = { + {"name", + {offset_of(&ColumnFamilyDescriptor::name), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"options", + {offset_of(&ColumnFamilyDescriptor::options), OptionType::kConfigurable, + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto cf_options = static_cast(addr); + return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(), + value, cf_options); + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto cf_options = static_cast(addr); + std::string result; + auto status = + GetStringFromColumnFamilyOptions(opts, *cf_options, &result); + *value = "{" + result + "}"; + return status; + }, + [](const ConfigOptions& opts, const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto this_one = static_cast(addr1); + const auto that_one = static_cast(addr2); + auto this_conf = CFOptionsAsConfigurable(*this_one); + auto that_conf = CFOptionsAsConfigurable(*that_one); + std::string mismatch_opt; + bool result = + this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); + if (!result) { + *mismatch = name + "." + mismatch_opt; + } + return result; + }}}, +}; + +static std::unordered_map cs_input_type_info = { + {"column_family", + OptionTypeInfo::Struct("column_family", &cfd_type_info, + offset_of(&CompactionServiceInput::column_family), + OptionVerificationType::kNormal, + OptionTypeFlags::kNone)}, + {"db_options", + {offset_of(&CompactionServiceInput::db_options), OptionType::kConfigurable, + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto options = static_cast(addr); + return GetDBOptionsFromString(opts, DBOptions(), value, options); + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto options = static_cast(addr); + std::string result; + auto status = GetStringFromDBOptions(opts, *options, &result); + *value = "{" + result + "}"; + return status; + }, + [](const ConfigOptions& opts, const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { + const auto this_one = static_cast(addr1); + const auto that_one = static_cast(addr2); + auto this_conf = DBOptionsAsConfigurable(*this_one); + auto that_conf = DBOptionsAsConfigurable(*that_one); + std::string mismatch_opt; + bool result = + this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt); + if (!result) { + *mismatch = name + "." 
+ mismatch_opt; + } + return result; + }}}, + {"snapshots", OptionTypeInfo::Vector( + offset_of(&CompactionServiceInput::snapshots), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kUInt64T})}, + {"input_files", OptionTypeInfo::Vector( + offset_of(&CompactionServiceInput::input_files), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + {0, OptionType::kEncodedString})}, + {"output_level", + {offset_of(&CompactionServiceInput::output_level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"has_begin", + {offset_of(&CompactionServiceInput::has_begin), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"begin", + {offset_of(&CompactionServiceInput::begin), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"has_end", + {offset_of(&CompactionServiceInput::has_end), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"end", + {offset_of(&CompactionServiceInput::end), OptionType::kEncodedString, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"approx_size", + {offset_of(&CompactionServiceInput::approx_size), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +}; + +static std::unordered_map + cs_output_file_type_info = { + {"file_name", + {offsetof(struct CompactionServiceOutputFile, file_name), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_seqno", + {offsetof(struct CompactionServiceOutputFile, smallest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_seqno", + {offsetof(struct CompactionServiceOutputFile, largest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_internal_key", + {offsetof(struct CompactionServiceOutputFile, smallest_internal_key), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_internal_key", + {offsetof(struct CompactionServiceOutputFile, largest_internal_key), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"oldest_ancester_time", + {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_creation_time", + {offsetof(struct CompactionServiceOutputFile, file_creation_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"paranoid_hash", + {offsetof(struct CompactionServiceOutputFile, paranoid_hash), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"marked_for_compaction", + {offsetof(struct CompactionServiceOutputFile, marked_for_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +static std::unordered_map + compaction_job_stats_type_info = { + {"elapsed_micros", + {offsetof(struct CompactionJobStats, elapsed_micros), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"cpu_micros", + {offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"num_input_records", + {offsetof(struct CompactionJobStats, num_input_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_blobs_read", + 
{offsetof(struct CompactionJobStats, num_blobs_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files", + {offsetof(struct CompactionJobStats, num_input_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_files_at_output_level", + {offsetof(struct CompactionJobStats, num_input_files_at_output_level), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct CompactionJobStats, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files", + {offsetof(struct CompactionJobStats, num_output_files), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_files_blob", + {offsetof(struct CompactionJobStats, num_output_files_blob), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_full_compaction", + {offsetof(struct CompactionJobStats, is_full_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"is_manual_compaction", + {offsetof(struct CompactionJobStats, is_manual_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_bytes", + {offsetof(struct CompactionJobStats, total_input_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_blob_bytes_read", + {offsetof(struct CompactionJobStats, total_blob_bytes_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes", + {offsetof(struct CompactionJobStats, total_output_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_output_bytes_blob", + {offsetof(struct CompactionJobStats, total_output_bytes_blob), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_records_replaced", + {offsetof(struct CompactionJobStats, num_records_replaced), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_raw_key_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_key_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_input_raw_value_bytes", + {offsetof(struct CompactionJobStats, total_input_raw_value_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_input_deletion_records", + {offsetof(struct CompactionJobStats, num_input_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_expired_deletion_records", + {offsetof(struct CompactionJobStats, num_expired_deletion_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_corrupt_keys", + {offsetof(struct CompactionJobStats, num_corrupt_keys), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_write_nanos", + {offsetof(struct CompactionJobStats, file_write_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_range_sync_nanos", + {offsetof(struct CompactionJobStats, file_range_sync_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_fsync_nanos", + {offsetof(struct CompactionJobStats, file_fsync_nanos), + 
OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_prepare_write_nanos", + {offsetof(struct CompactionJobStats, file_prepare_write_nanos), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"smallest_output_key_prefix", + {offsetof(struct CompactionJobStats, smallest_output_key_prefix), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"largest_output_key_prefix", + {offsetof(struct CompactionJobStats, largest_output_key_prefix), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_single_del_fallthru", + {offsetof(struct CompactionJobStats, num_single_del_fallthru), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_single_del_mismatch", + {offsetof(struct CompactionJobStats, num_single_del_mismatch), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +namespace { +// this is a helper struct to serialize and deserialize class Status, because +// Status's members are not public. +struct StatusSerializationAdapter { + uint8_t code; + uint8_t subcode; + uint8_t severity; + std::string message; + + StatusSerializationAdapter() {} + explicit StatusSerializationAdapter(const Status& s) { + code = s.code(); + subcode = s.subcode(); + severity = s.severity(); + auto msg = s.getState(); + message = msg ? msg : ""; + } + + Status GetStatus() { + return Status(static_cast(code), + static_cast(subcode), + static_cast(severity), message); + } +}; +} // namespace + +static std::unordered_map + status_adapter_type_info = { + {"code", + {offsetof(struct StatusSerializationAdapter, code), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"subcode", + {offsetof(struct StatusSerializationAdapter, subcode), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"severity", + {offsetof(struct StatusSerializationAdapter, severity), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"message", + {offsetof(struct StatusSerializationAdapter, message), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; + +static std::unordered_map cs_result_type_info = { + {"status", + {offsetof(struct CompactionServiceResult, status), + OptionType::kCustomizable, OptionVerificationType::kNormal, + OptionTypeFlags::kNone, + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto status_obj = static_cast(addr); + StatusSerializationAdapter adapter; + Status s = OptionTypeInfo::ParseType( + opts, value, status_adapter_type_info, &adapter); + *status_obj = adapter.GetStatus(); + return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr, std::string* value) { + const auto status_obj = static_cast(addr); + StatusSerializationAdapter adapter(*status_obj); + std::string result; + Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info, + &adapter, &result); + *value = "{" + result + "}"; + return s; + }, + [](const ConfigOptions& opts, const std::string& /*name*/, + const void* addr1, const void* addr2, std::string* mismatch) { + const auto status1 = static_cast(addr1); + const auto status2 = static_cast(addr2); + StatusSerializationAdapter adatper1(*status1); + StatusSerializationAdapter adapter2(*status2); + return 
OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info, + &adatper1, &adapter2, mismatch); + }}}, + {"output_files", + OptionTypeInfo::Vector( + offsetof(struct CompactionServiceResult, output_files), + OptionVerificationType::kNormal, OptionTypeFlags::kNone, + OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone))}, + {"output_level", + {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"output_path", + {offsetof(struct CompactionServiceResult, output_path), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_output_records", + {offsetof(struct CompactionServiceResult, num_output_records), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"total_bytes", + {offsetof(struct CompactionServiceResult, total_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_read", + {offsetof(struct CompactionServiceResult, bytes_read), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"bytes_written", + {offsetof(struct CompactionServiceResult, bytes_written), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"stats", OptionTypeInfo::Struct( + "stats", &compaction_job_stats_type_info, + offsetof(struct CompactionServiceResult, stats), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, +}; + +Status CompactionServiceInput::Read(const std::string& data_str, + CompactionServiceInput* obj) { + if (data_str.size() <= sizeof(BinaryFormatVersion)) { + return Status::InvalidArgument("Invalid CompactionServiceInput string"); + } + auto format_version = DecodeFixed32(data_str.data()); + if (format_version == kOptionsString) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + cf.ignore_unknown_options = true; + return OptionTypeInfo::ParseType( + cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info, + obj); + } else { + return Status::NotSupported( + "Compaction Service Input data version not supported: " + + ToString(format_version)); + } +} + +Status CompactionServiceInput::Write(std::string* output) { + char buf[sizeof(BinaryFormatVersion)]; + EncodeFixed32(buf, kOptionsString); + output->append(buf, sizeof(BinaryFormatVersion)); + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output); +} + +Status CompactionServiceResult::Read(const std::string& data_str, + CompactionServiceResult* obj) { + if (data_str.size() <= sizeof(BinaryFormatVersion)) { + return Status::InvalidArgument("Invalid CompactionServiceResult string"); + } + auto format_version = DecodeFixed32(data_str.data()); + if (format_version == kOptionsString) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + cf.ignore_unknown_options = true; + return OptionTypeInfo::ParseType( + cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info, + obj); + } else { + return Status::NotSupported( + "Compaction Service Result data version not supported: " + + ToString(format_version)); + } +} + +Status CompactionServiceResult::Write(std::string* output) { + char buf[sizeof(BinaryFormatVersion)]; + EncodeFixed32(buf, kOptionsString); + output->append(buf, sizeof(BinaryFormatVersion)); + ConfigOptions cf; + cf.invoke_prepare_options = false; + return 
OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output); +} + +#ifndef NDEBUG +bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) { + std::string mismatch; + return TEST_Equals(other, &mismatch); +} + +bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other, + std::string* mismatch) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other, + mismatch); +} + +bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) { + std::string mismatch; + return TEST_Equals(other, &mismatch); +} + +bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other, + std::string* mismatch) { + ConfigOptions cf; + cf.invoke_prepare_options = false; + return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other, + mismatch); +} +#endif // NDEBUG +#endif // !ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index 18e9d5e8bd2..581692d4df0 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -17,6 +17,7 @@ #include #include +#include "db/blob/blob_file_completion_callback.h" #include "db/column_family.h" #include "db/compaction/compaction_iterator.h" #include "db/dbformat.h" @@ -51,6 +52,7 @@ class Arena; class ErrorHandler; class MemTable; class SnapshotChecker; +class SystemClock; class TableCache; class Version; class VersionEdit; @@ -65,12 +67,13 @@ class CompactionJob { public: CompactionJob( int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, const FileOptions& file_options, VersionSet* versions, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, FSDirectory* db_directory, FSDirectory* output_directory, - Statistics* stats, InstrumentedMutex* db_mutex, - ErrorHandler* db_error_handler, + FSDirectory* blob_output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, @@ -79,9 +82,12 @@ class CompactionJob { const std::string& dbname, CompactionJobStats* compaction_job_stats, Env::Priority thread_pri, const std::shared_ptr& io_tracer, const std::atomic* manual_compaction_paused = nullptr, - const std::string& db_id = "", const std::string& db_session_id = ""); + const std::atomic* manual_compaction_canceled = nullptr, + const std::string& db_id = "", const std::string& db_session_id = "", + std::string full_history_ts_low = "", + BlobFileCompletionCallback* blob_callback = nullptr); - ~CompactionJob(); + virtual ~CompactionJob(); // no copy/move CompactionJob(CompactionJob&& job) = delete; @@ -110,11 +116,36 @@ class CompactionJob { // Return the IO status IOStatus io_status() const { return io_status_; } - private: + protected: struct SubcompactionState; + // CompactionJob state + struct CompactionState; void AggregateStatistics(); + void UpdateCompactionStats(); + void LogCompaction(); + void RecordCompactionIOStats(); + void CleanupCompaction(); + + // Call compaction filter. 
Then iterate through input and compact the + // kv-pairs + void ProcessKeyValueCompaction(SubcompactionState* sub_compact); + CompactionState* compact_; + InternalStats::CompactionStats compaction_stats_; + const ImmutableDBOptions& db_options_; + const MutableDBOptions mutable_db_options_copy_; + LogBuffer* log_buffer_; + FSDirectory* output_directory_; + Statistics* stats_; + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + + Env::WriteLifeTimeHint write_hint_; + + IOStatus io_status_; + + private: // Generates a histogram representing potential divisions of key ranges from // the input. It adds the starting and/or ending keys of certain input files // to the working set and then finds the approximate size of data in between @@ -122,12 +153,12 @@ class CompactionJob { // consecutive groups such that each group has a similar size. void GenSubcompactionBoundaries(); + void ProcessKeyValueCompactionWithCompactionService( + SubcompactionState* sub_compact); + // update the thread status for starting a compaction. void ReportStartedCompaction(Compaction* compaction); void AllocateCompactionOutputFileNumbers(); - // Call compaction filter. Then iterate through input and compact the - // kv-pairs - void ProcessKeyValueCompaction(SubcompactionState* sub_compact); Status FinishCompactionOutputFile( const Status& input_status, SubcompactionState* sub_compact, @@ -135,33 +166,23 @@ class CompactionJob { CompactionIterationStats* range_del_out_stats, const Slice* next_table_min_key = nullptr); Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); - void RecordCompactionIOStats(); Status OpenCompactionOutputFile(SubcompactionState* sub_compact); - void CleanupCompaction(); void UpdateCompactionJobStats( const InternalStats::CompactionStats& stats) const; void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, CompactionJobStats* compaction_job_stats = nullptr); - void UpdateCompactionStats(); void UpdateCompactionInputStatsHelper( int* num_files, uint64_t* bytes_read, int input_level); - void LogCompaction(); - int job_id_; - // CompactionJob state - struct CompactionState; - CompactionState* compact_; CompactionJobStats* compaction_job_stats_; - InternalStats::CompactionStats compaction_stats_; // DBImpl state const std::string& dbname_; const std::string db_id_; const std::string db_session_id_; - const ImmutableDBOptions& db_options_; const FileOptions file_options_; Env* env_; @@ -172,11 +193,10 @@ class CompactionJob { VersionSet* versions_; const std::atomic* shutting_down_; const std::atomic* manual_compaction_paused_; + const std::atomic* manual_compaction_canceled_; const SequenceNumber preserve_deletes_seqnum_; - LogBuffer* log_buffer_; FSDirectory* db_directory_; - FSDirectory* output_directory_; - Statistics* stats_; + FSDirectory* blob_output_directory_; InstrumentedMutex* db_mutex_; ErrorHandler* db_error_handler_; // If there were two snapshots with seq numbers s1 and @@ -196,17 +216,153 @@ class CompactionJob { EventLogger* event_logger_; - // Is this compaction creating a file in the bottom most level? 
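Design note on the header reshuffle here: these members move from private to protected, the destructor becomes virtual, and GetTableFileName() (declared a bit further down) becomes a virtual hook, so a subclass can reuse the whole key-value processing machinery while only changing where output tables land, which is exactly what CompactionServiceCompactionJob does later in this header. A purely hypothetical variant as a sketch, not part of this patch:

class ScratchDirCompactionJob : public CompactionJob {
 public:
  using CompactionJob::CompactionJob;  // reuse the (long) base constructor

 private:
  std::string GetTableFileName(uint64_t file_number) override {
    // MakeTableFileName() is the existing helper that
    // CompactionServiceCompactionJob also uses; the directory is made up.
    return MakeTableFileName("/tmp/compaction_scratch", file_number);
  }
};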
- bool bottommost_level_; bool paranoid_file_checks_; bool measure_io_stats_; // Stores the Slices that designate the boundaries for each subcompaction std::vector boundaries_; // Stores the approx size of keys covered in the range of each subcompaction std::vector sizes_; - Env::WriteLifeTimeHint write_hint_; Env::Priority thread_pri_; - IOStatus io_status_; + std::string full_history_ts_low_; + BlobFileCompletionCallback* blob_callback_; + + // Get the table file name in the location it is outputting to, which should also be in + // `output_directory_`. + virtual std::string GetTableFileName(uint64_t file_number); +}; + +// CompactionServiceInput is used to pass compaction information between two +// db instances. It contains the information needed to do a compaction. It +// doesn't contain the LSM tree information, which is passed through the MANIFEST +// file. +struct CompactionServiceInput { + ColumnFamilyDescriptor column_family; + + DBOptions db_options; + + std::vector snapshots; + + // SST files for compaction; it should already be expanded to include all the + // files needed for this compaction, for both input level files and output + // level files. + std::vector input_files; + int output_level; + + // information for subcompaction + bool has_begin = false; + std::string begin; + bool has_end = false; + std::string end; + uint64_t approx_size = 0; + + // serialization interface to read and write the object + static Status Read(const std::string& data_str, CompactionServiceInput* obj); + Status Write(std::string* output); + + // Initialize a dummy ColumnFamilyDescriptor + CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {} + +#ifndef NDEBUG + bool TEST_Equals(CompactionServiceInput* other); + bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch); +#endif // NDEBUG +}; + +// CompactionServiceOutputFile is the metadata for the output SST file +struct CompactionServiceOutputFile { + std::string file_name; + SequenceNumber smallest_seqno; + SequenceNumber largest_seqno; + std::string smallest_internal_key; + std::string largest_internal_key; + uint64_t oldest_ancester_time; + uint64_t file_creation_time; + uint64_t paranoid_hash; + bool marked_for_compaction; + + CompactionServiceOutputFile() = default; + CompactionServiceOutputFile( + const std::string& name, SequenceNumber smallest, SequenceNumber largest, + std::string _smallest_internal_key, std::string _largest_internal_key, + uint64_t _oldest_ancester_time, uint64_t _file_creation_time, + uint64_t _paranoid_hash, bool _marked_for_compaction) + : file_name(name), + smallest_seqno(smallest), + largest_seqno(largest), + smallest_internal_key(std::move(_smallest_internal_key)), + largest_internal_key(std::move(_largest_internal_key)), + oldest_ancester_time(_oldest_ancester_time), + file_creation_time(_file_creation_time), + paranoid_hash(_paranoid_hash), + marked_for_compaction(_marked_for_compaction) {} +}; + +// CompactionServiceResult contains the compaction result from a different db +// instance; with this information, the primary db instance with write +// permission is able to install the result into the DB.
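Before moving on to the result type, note that the Read()/Write() pair declared on CompactionServiceInput above defines a small, versioned, options-string-based wire format, so the two db instances never need to agree on in-memory layout. A minimal round trip using only interfaces declared in this patch (field values are arbitrary, and the struct lives in this internal header rather than the public API):

#include <cassert>
#include <string>

#include "db/compaction/compaction_job.h"

void CompactionServiceInputRoundTrip() {
  ROCKSDB_NAMESPACE::CompactionServiceInput input;
  input.output_level = 3;
  input.has_begin = true;
  input.begin = "key_prefix_a";
  input.input_files = {"000012.sst", "000013.sst"};

  std::string wire;  // 4-byte format version followed by an options string
  ROCKSDB_NAMESPACE::Status s = input.Write(&wire);
  assert(s.ok());

  ROCKSDB_NAMESPACE::CompactionServiceInput decoded;
  s = ROCKSDB_NAMESPACE::CompactionServiceInput::Read(wire, &decoded);
  assert(s.ok());
#ifndef NDEBUG
  assert(decoded.TEST_Equals(&input));  // field-by-field check, debug builds only
#endif
}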
+struct CompactionServiceResult { + Status status; + std::vector output_files; + int output_level; + + // location of the output files + std::string output_path; + + // some statistics about the compaction + uint64_t num_output_records; + uint64_t total_bytes; + uint64_t bytes_read; + uint64_t bytes_written; + CompactionJobStats stats; + + // serialization interface to read and write the object + static Status Read(const std::string& data_str, CompactionServiceResult* obj); + Status Write(std::string* output); + +#ifndef NDEBUG + bool TEST_Equals(CompactionServiceResult* other); + bool TEST_Equals(CompactionServiceResult* other, std::string* mismatch); +#endif // NDEBUG +}; + +// CompactionServiceCompactionJob is a read-only compaction job; it takes +// input information from `compaction_service_input` and puts result information +// in `compaction_service_result`; the SST files are generated in `output_path`. +class CompactionServiceCompactionJob : private CompactionJob { + public: + CompactionServiceCompactionJob( + int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, + const MutableDBOptions& mutable_db_options, + const FileOptions& file_options, VersionSet* versions, + const std::atomic* shutting_down, LogBuffer* log_buffer, + FSDirectory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, + std::vector existing_snapshots, + std::shared_ptr table_cache, EventLogger* event_logger, + const std::string& dbname, const std::shared_ptr& io_tracer, + const std::string& db_id, const std::string& db_session_id, + const std::string& output_path, + const CompactionServiceInput& compaction_service_input, + CompactionServiceResult* compaction_service_result); + + // Run the compaction in the current thread and return the result + Status Run(); + + void CleanupCompaction(); + + IOStatus io_status() const { return CompactionJob::io_status(); } + + private: + // Get table file name in output_path + std::string GetTableFileName(uint64_t file_number) override; + // Specifies the compaction output path; otherwise it uses the default DB path + const std::string output_path_; + + // Compaction job input + const CompactionServiceInput& compaction_input_; + + // Compaction job result + CompactionServiceResult* compaction_result_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc index 9c640a591e0..325cc247e29 100644 --- a/db/compaction/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -24,7 +24,6 @@ #include "db/write_batch_internal.h" #include "env/mock_env.h" #include "file/filename.h" -#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "monitoring/thread_status_util.h" @@ -298,15 +297,14 @@ class CompactionJobStatsTest : public testing::Test, return result; } - uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) { + Status Size(uint64_t* size, const Slice& start, const Slice& limit, + int cf = 0) { Range r(start, limit); - uint64_t size; if (cf == 0) { - db_->GetApproximateSizes(&r, 1, &size); + return db_->GetApproximateSizes(&r, 1, size); } else { - db_->GetApproximateSizes(handles_[1], &r, 1, &size); + return db_->GetApproximateSizes(handles_[1], &r, 1, size); } - return size; } void Compact(int cf, const Slice& start, const Slice& limit, diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index
210042ca05e..7437f1249ff 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -5,6 +5,8 @@ #ifndef ROCKSDB_LITE +#include "db/compaction/compaction_job.h" + #include #include #include @@ -14,13 +16,13 @@ #include "db/blob/blob_index.h" #include "db/column_family.h" -#include "db/compaction/compaction_job.h" #include "db/db_impl/db_impl.h" #include "db/error_handler.h" #include "db/version_set.h" #include "file/writable_file_writer.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "rocksdb/options.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" @@ -67,31 +69,38 @@ void VerifyInitializationOfCompactionJobStats( } // namespace -// TODO(icanadi) Make it simpler once we mock out VersionSet -class CompactionJobTest : public testing::Test { - public: - CompactionJobTest() +class CompactionJobTestBase : public testing::Test { + protected: + CompactionJobTestBase(std::string dbname, const Comparator* ucmp, + std::function encode_u64_ts) : env_(Env::Default()), - fs_(std::make_shared(env_)), - dbname_(test::PerThreadDBPath("compaction_job_test")), + fs_(env_->GetFileSystem()), + dbname_(std::move(dbname)), + ucmp_(ucmp), db_options_(), mutable_cf_options_(cf_options_), mutable_db_options_(), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), - versions_(new VersionSet( - dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)), + versions_(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, + &write_controller_, + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ "")), shutting_down_(false), preserve_deletes_seqnum_(0), mock_table_factory_(new mock::MockTableFactory()), - error_handler_(nullptr, db_options_, &mutex_) { + error_handler_(nullptr, db_options_, &mutex_), + encode_u64_ts_(std::move(encode_u64_ts)) {} + + void SetUp() override { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); db_options_.env = env_; db_options_.fs = fs_; db_options_.db_paths.emplace_back(dbname_, std::numeric_limits::max()); + cf_options_.comparator = ucmp_; + cf_options_.table_factory = mock_table_factory_; } std::string GenerateFileName(uint64_t file_number) { @@ -102,9 +111,10 @@ class CompactionJobTest : public testing::Test { return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); } - static std::string KeyStr(const std::string& user_key, - const SequenceNumber seq_num, const ValueType t) { - return InternalKey(user_key, seq_num, t).Encode().ToString(); + std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num, + const ValueType t, uint64_t ts = 0) { + std::string user_key_with_ts = user_key + encode_u64_ts_(ts); + return InternalKey(user_key_with_ts, seq_num, t).Encode().ToString(); } static std::string BlobStr(uint64_t blob_file_number, uint64_t offset, @@ -144,7 +154,8 @@ class CompactionJobTest : public testing::Test { std::string skey; std::string value; std::tie(skey, value) = kv; - const Status pikStatus = ParseInternalKey(skey, &key); + const Status pik_status = + ParseInternalKey(skey, &key, true /* log_err_key */); smallest_seqno = std::min(smallest_seqno, key.sequence); largest_seqno = std::max(largest_seqno, key.sequence); @@ -162,7 +173,7 @@ class CompactionJobTest : public testing::Test { first_key = false; - if (pikStatus.ok() && 
key.type == kTypeBlobIndex) { + if (pik_status.ok() && key.type == kTypeBlobIndex) { BlobIndex blob_index; const Status s = blob_index.DecodeFrom(value); if (!s.ok()) { @@ -207,9 +218,9 @@ class CompactionJobTest : public testing::Test { // returns expected result after compaction mock::KVVector CreateTwoFiles(bool gen_corrupted_keys) { stl_wrappers::KVMap expected_results; - const int kKeysPerFile = 10000; - const int kCorruptKeysPerFile = 200; - const int kMatchingKeys = kKeysPerFile / 2; + constexpr int kKeysPerFile = 10000; + constexpr int kCorruptKeysPerFile = 200; + constexpr int kMatchingKeys = kKeysPerFile / 2; SequenceNumber sequence_number = 0; auto corrupt_id = [&](int id) { @@ -238,7 +249,7 @@ class CompactionJobTest : public testing::Test { {bottommost_internal_key.Encode().ToString(), value}); } } - mock::SortKVVector(&contents); + mock::SortKVVector(&contents, ucmp_); AddMockFile(contents); } @@ -254,33 +265,29 @@ class CompactionJobTest : public testing::Test { } void NewDB() { - DestroyDB(dbname_, Options()); + EXPECT_OK(DestroyDB(dbname_, Options())); EXPECT_OK(env_->CreateDirIfMissing(dbname_)); versions_.reset( new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)); + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); compaction_job_stats_.Reset(); - SetIdentityFile(env_, dbname_); + ASSERT_OK(SetIdentityFile(env_, dbname_)); VersionEdit new_db; - if (db_options_.write_dbid_to_manifest) { - DBImpl* impl = new DBImpl(DBOptions(), dbname_); - std::string db_id; - impl->GetDbIdentityFromIdentityFile(&db_id); - new_db.SetDBId(db_id); - } new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); const std::string manifest = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + std::unique_ptr file_writer; + const auto& fs = env_->GetFileSystem(); + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); + ASSERT_OK(s); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_)); { log::Writer log(std::move(file_writer), 0, false); std::string record; @@ -293,13 +300,12 @@ class CompactionJobTest : public testing::Test { ASSERT_OK(s); - std::vector column_families; - cf_options_.table_factory = mock_table_factory_; cf_options_.merge_operator = merge_op_; cf_options_.compaction_filter = compaction_filter_.get(); + std::vector column_families; column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); - EXPECT_OK(versions_->Recover(column_families, false)); + ASSERT_OK(versions_->Recover(column_families, false)); cfd_ = versions_->GetColumnFamilySet()->GetDefault(); } @@ -337,19 +343,23 @@ class CompactionJobTest : public testing::Test { EventLogger event_logger(db_options_.info_log.get()); // TODO(yiwu) add a mock snapshot checker and add test for it. 
SnapshotChecker* snapshot_checker = nullptr; + ASSERT_TRUE(full_history_ts_low_.empty() || + ucmp_->timestamp_size() == full_history_ts_low_.size()); CompactionJob compaction_job( - 0, &compaction, db_options_, env_options_, versions_.get(), - &shutting_down_, preserve_deletes_seqnum_, &log_buffer, nullptr, - nullptr, nullptr, &mutex_, &error_handler_, snapshots, + 0, &compaction, db_options_, mutable_db_options_, env_options_, + versions_.get(), &shutting_down_, preserve_deletes_seqnum_, &log_buffer, + nullptr, nullptr, nullptr, nullptr, &mutex_, &error_handler_, snapshots, earliest_write_conflict_snapshot, snapshot_checker, table_cache_, &event_logger, false, false, dbname_, &compaction_job_stats_, - Env::Priority::USER, nullptr /* IOTracer */); + Env::Priority::USER, nullptr /* IOTracer */, + /*manual_compaction_paused=*/nullptr, + /*manual_compaction_canceled=*/nullptr, /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low_); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(); mutex_.Unlock(); - Status s; - s = compaction_job.Run(); + Status s = compaction_job.Run(); ASSERT_OK(s); ASSERT_OK(compaction_job.io_status()); mutex_.Lock(); @@ -379,6 +389,7 @@ class CompactionJobTest : public testing::Test { Env* env_; std::shared_ptr fs_; std::string dbname_; + const Comparator* const ucmp_; EnvOptions env_options_; ImmutableDBOptions db_options_; ColumnFamilyOptions cf_options_; @@ -397,6 +408,17 @@ class CompactionJobTest : public testing::Test { std::unique_ptr compaction_filter_; std::shared_ptr merge_op_; ErrorHandler error_handler_; + std::string full_history_ts_low_; + const std::function encode_u64_ts_; +}; + +// TODO(icanadi) Make it simpler once we mock out VersionSet +class CompactionJobTest : public CompactionJobTestBase { + public: + CompactionJobTest() + : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_test"), + BytewiseComparator(), + [](uint64_t /*ts*/) { return ""; }) {} }; TEST_F(CompactionJobTest, Simple) { @@ -1077,6 +1099,297 @@ TEST_F(CompactionJobTest, OldestBlobFileNumber) { /* expected_oldest_blob_file_number */ 19); } +TEST_F(CompactionJobTest, InputSerialization) { + // Setup a random CompactionServiceInput + CompactionServiceInput input; + const int kStrMaxLen = 1000; + Random rnd(static_cast(time(nullptr))); + Random64 rnd64(time(nullptr)); + input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen)); + input.column_family.options.comparator = ReverseBytewiseComparator(); + input.column_family.options.max_bytes_for_level_base = + rnd64.Uniform(UINT64_MAX); + input.column_family.options.disable_auto_compactions = rnd.OneIn(2); + input.column_family.options.compression = kZSTD; + input.column_family.options.compression_opts.level = 4; + input.db_options.max_background_flushes = 10; + input.db_options.paranoid_checks = rnd.OneIn(2); + input.db_options.statistics = CreateDBStatistics(); + input.db_options.env = env_; + while (!rnd.OneIn(10)) { + input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX)); + } + while (!rnd.OneIn(10)) { + input.input_files.emplace_back(rnd.RandomString(rnd.Uniform(kStrMaxLen))); + } + input.output_level = 4; + input.has_begin = rnd.OneIn(2); + if (input.has_begin) { + input.begin = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); + } + input.has_end = rnd.OneIn(2); + if (input.has_end) { + input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)); + } + input.approx_size = rnd64.Uniform(UINT64_MAX); + + std::string output; + ASSERT_OK(input.Write(&output)); + + // Test 
deserialization + CompactionServiceInput deserialized1; + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized1)); + ASSERT_TRUE(deserialized1.TEST_Equals(&input)); + + // Test mismatch + deserialized1.db_options.max_background_flushes += 10; + std::string mismatch; + ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch)); + ASSERT_EQ(mismatch, "db_options.max_background_flushes"); + + // Test unknown field + CompactionServiceInput deserialized2; + output.clear(); + ASSERT_OK(input.Write(&output)); + output.append("new_field=123;"); + + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized2)); + ASSERT_TRUE(deserialized2.TEST_Equals(&input)); + + // Test missing field + CompactionServiceInput deserialized3; + deserialized3.output_level = 0; + std::string to_remove = "output_level=4;"; + size_t pos = output.find(to_remove); + ASSERT_TRUE(pos != std::string::npos); + output.erase(pos, to_remove.length()); + ASSERT_OK(CompactionServiceInput::Read(output, &deserialized3)); + mismatch.clear(); + ASSERT_FALSE(deserialized3.TEST_Equals(&input, &mismatch)); + ASSERT_EQ(mismatch, "output_level"); + + // manually set the value back, should match the original structure + deserialized3.output_level = 4; + ASSERT_TRUE(deserialized3.TEST_Equals(&input)); + + // Test invalid version + output.clear(); + ASSERT_OK(input.Write(&output)); + + uint32_t data_version = DecodeFixed32(output.data()); + const size_t kDataVersionSize = sizeof(data_version); + ASSERT_EQ(data_version, + 1U); // Update once the default data version is changed + char buf[kDataVersionSize]; + EncodeFixed32(buf, data_version + 10); // make sure it's not valid + output.replace(0, kDataVersionSize, buf, kDataVersionSize); + Status s = CompactionServiceInput::Read(output, &deserialized3); + ASSERT_TRUE(s.IsNotSupported()); +} + +TEST_F(CompactionJobTest, ResultSerialization) { + // Setup a random CompactionServiceResult + CompactionServiceResult result; + const int kStrMaxLen = 1000; + Random rnd(static_cast(time(nullptr))); + Random64 rnd64(time(nullptr)); + std::vector status_list = { + Status::OK(), + Status::InvalidArgument("invalid option"), + Status::Aborted("failed to run"), + Status::NotSupported("not supported option"), + }; + result.status = + status_list.at(rnd.Uniform(static_cast(status_list.size()))); + while (!rnd.OneIn(10)) { + result.output_files.emplace_back( + rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX), + rnd64.Uniform(UINT64_MAX), + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), + rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), + rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), + rnd64.Uniform(UINT64_MAX), rnd.OneIn(2)); + } + result.output_level = rnd.Uniform(10); + result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen)); + result.num_output_records = rnd64.Uniform(UINT64_MAX); + result.total_bytes = rnd64.Uniform(UINT64_MAX); + result.bytes_read = 123; + result.bytes_written = rnd64.Uniform(UINT64_MAX); + result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX); + result.stats.num_output_files = rnd.Uniform(1000); + result.stats.is_full_compaction = rnd.OneIn(2); + result.stats.num_single_del_mismatch = rnd64.Uniform(UINT64_MAX); + result.stats.num_input_files = 9; + + std::string output; + ASSERT_OK(result.Write(&output)); + + // Test deserialization + CompactionServiceResult deserialized1; + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1)); + ASSERT_TRUE(deserialized1.TEST_Equals(&result)); + + // Test mismatch + 
deserialized1.stats.num_input_files += 10; + std::string mismatch; + ASSERT_FALSE(deserialized1.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "stats.num_input_files"); + + // Test unknown field + CompactionServiceResult deserialized2; + output.clear(); + ASSERT_OK(result.Write(&output)); + output.append("new_field=123;"); + + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized2)); + ASSERT_TRUE(deserialized2.TEST_Equals(&result)); + + // Test missing field + CompactionServiceResult deserialized3; + deserialized3.bytes_read = 0; + std::string to_remove = "bytes_read=123;"; + size_t pos = output.find(to_remove); + ASSERT_TRUE(pos != std::string::npos); + output.erase(pos, to_remove.length()); + ASSERT_OK(CompactionServiceResult::Read(output, &deserialized3)); + mismatch.clear(); + ASSERT_FALSE(deserialized3.TEST_Equals(&result, &mismatch)); + ASSERT_EQ(mismatch, "bytes_read"); + + deserialized3.bytes_read = 123; + ASSERT_TRUE(deserialized3.TEST_Equals(&result)); + + // Test invalid version + output.clear(); + ASSERT_OK(result.Write(&output)); + + uint32_t data_version = DecodeFixed32(output.data()); + const size_t kDataVersionSize = sizeof(data_version); + ASSERT_EQ(data_version, + 1U); // Update once the default data version is changed + char buf[kDataVersionSize]; + EncodeFixed32(buf, data_version + 10); // make sure it's not valid + output.replace(0, kDataVersionSize, buf, kDataVersionSize); + Status s = CompactionServiceResult::Read(output, &deserialized3); + ASSERT_TRUE(s.IsNotSupported()); + for (const auto& item : status_list) { + item.PermitUncheckedError(); + } +} + +class CompactionJobTimestampTest : public CompactionJobTestBase { + public: + CompactionJobTimestampTest() + : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_ts_test"), + test::ComparatorWithU64Ts(), test::EncodeInt) {} +}; + +TEST_F(CompactionJobTimestampTest, GCDisabled) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"}, + {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"}, + {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("b", 7, ValueType::kTypeDeletionWithTimestamp, 97), ""}, + {KeyStr("c", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""}, + {KeyStr("c", 5, ValueType::kTypeValue, 95), "c5"}}); + AddMockFile(file2); + + SetLastSequence(10); + + auto expected_results = mock::MakeMockFile( + {{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"}, + {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"}, + {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"}, + {KeyStr("b", 7, ValueType::kTypeDeletionWithTimestamp, 97), ""}, + {KeyStr("c", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""}, + {KeyStr("c", 5, ValueType::kTypeValue, 95), "c5"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTimestampTest, NoKeyExpired) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"}, + {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"}, + {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}}); + AddMockFile(file1); + + auto file2 = + mock::MakeMockFile({{KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"}, + {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}}); + AddMockFile(file2); + + SetLastSequence(101); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"}, + {KeyStr("a", 4, 
ValueType::kTypeValue, 98), "a4"}, + {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"}, + {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}, + {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + + full_history_ts_low_ = encode_u64_ts_(0); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTimestampTest, AllKeysExpired) { + NewDB(); + + auto file1 = mock::MakeMockFile( + {{KeyStr("a", 5, ValueType::kTypeDeletionWithTimestamp, 100), ""}, + {KeyStr("b", 6, ValueType::kTypeValue, 99), "b6"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"}, + {KeyStr("b", 3, ValueType::kTypeDeletionWithTimestamp, 97), ""}, + {KeyStr("b", 2, ValueType::kTypeValue, 96), "b2"}}); + AddMockFile(file2); + + SetLastSequence(6); + + auto expected_results = + mock::MakeMockFile({{KeyStr("b", 0, ValueType::kTypeValue, 0), "b6"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + + full_history_ts_low_ = encode_u64_ts_(std::numeric_limits::max()); + RunCompaction({files}, expected_results); +} + +TEST_F(CompactionJobTimestampTest, SomeKeysExpired) { + NewDB(); + + auto file1 = + mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"}, + {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}}); + AddMockFile(file1); + + auto file2 = mock::MakeMockFile( + {{KeyStr("a", 3, ValueType::kTypeValue, 48), "a3"}, + {KeyStr("a", 2, ValueType::kTypeValue, 46), "a2"}, + {KeyStr("b", 4, ValueType::kTypeDeletionWithTimestamp, 47), ""}}); + AddMockFile(file2); + + SetLastSequence(6); + + auto expected_results = + mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"}, + {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}}); + const auto& files = cfd_->current()->storage_info()->LevelFiles(0); + + full_history_ts_low_ = encode_u64_ts_(49); + RunCompaction({files}, expected_results); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index 523418c3d86..6d109213420 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -139,18 +139,16 @@ CompressionOptions GetCompressionOptions(const MutableCFOptions& cf_options, if (!enable_compression) { return cf_options.compression_opts; } - // If bottommost_compression is set and we are compacting to the - // bottommost level then we should use the specified compression options - // for the bottmomost_compression. - if (cf_options.bottommost_compression != kDisableCompressionOption && - level >= (vstorage->num_non_empty_levels() - 1) && + // If bottommost_compression_opts is enabled and we are compacting to the + // bottommost level then we should use the specified compression options. 
+ if (level >= (vstorage->num_non_empty_levels() - 1) && cf_options.bottommost_compression_opts.enabled) { return cf_options.bottommost_compression_opts; } return cf_options.compression_opts; } -CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions, +CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : ioptions_(ioptions), icmp_(icmp) {} @@ -532,7 +530,7 @@ bool CompactionPicker::SetupOtherInputs( } } if (expand_inputs) { - ROCKS_LOG_INFO(ioptions_.info_log, + ROCKS_LOG_INFO(ioptions_.logger, "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n", @@ -672,17 +670,41 @@ Compaction* CompactionPicker::CompactRange( // two files overlap. if (input_level > 0) { const uint64_t limit = mutable_cf_options.max_compaction_bytes; - uint64_t total = 0; + uint64_t input_level_total = 0; + int hint_index = -1; + InternalKey* smallest = nullptr; + InternalKey* largest = nullptr; for (size_t i = 0; i + 1 < inputs.size(); ++i) { + if (!smallest) { + smallest = &inputs[i]->smallest; + } + largest = &inputs[i]->largest; + uint64_t s = inputs[i]->compensated_file_size; - total += s; - if (total >= limit) { + uint64_t output_level_total = 0; + if (output_level < vstorage->num_non_empty_levels()) { + std::vector files; + vstorage->GetOverlappingInputsRangeBinarySearch( + output_level, smallest, largest, &files, hint_index, &hint_index); + for (const auto& file : files) { + output_level_total += file->compensated_file_size; + } + } + + input_level_total += s; + + if (input_level_total + output_level_total >= limit) { covering_the_whole_range = false; + // still include the current file, so the compaction could be larger + // than max_compaction_bytes, which is also to make sure the compaction + // can make progress even `max_compaction_bytes` is small (e.g. smaller + // than an SST file). inputs.files.resize(i + 1); break; } } } + assert(compact_range_options.target_path_id < static_cast(ioptions_.cf_paths.size())); @@ -1006,6 +1028,7 @@ Status CompactionPicker::SanitizeCompactionInputFiles( // any currently-existing files. 
for (auto file_num : *input_files) { bool found = false; + int input_file_level = -1; for (const auto& level_meta : cf_meta.levels) { for (const auto& file_meta : level_meta.files) { if (file_num == TableFileNameToNumber(file_meta.name)) { @@ -1015,6 +1038,7 @@ Status CompactionPicker::SanitizeCompactionInputFiles( " is already being compacted."); } found = true; + input_file_level = level_meta.level; break; } } @@ -1027,6 +1051,13 @@ Status CompactionPicker::SanitizeCompactionInputFiles( "Specified compaction input file " + MakeTableFileName("", file_num) + " does not exist in column family " + cf_meta.name + "."); } + if (input_file_level > output_level) { + return Status::InvalidArgument( + "Cannot compact file to up level, input file: " + + MakeTableFileName("", file_num) + " level " + + ToString(input_file_level) + " > output level " + + ToString(output_level)); + } } return Status::OK(); @@ -1045,6 +1076,8 @@ void CompactionPicker::RegisterCompaction(Compaction* c) { level0_compactions_in_progress_.insert(c); } compactions_in_progress_.insert(c); + TEST_SYNC_POINT_CALLBACK("CompactionPicker::RegisterCompaction:Registered", + c); } void CompactionPicker::UnregisterCompaction(Compaction* c) { diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h index 4bd431d712a..70de11f94e8 100644 --- a/db/compaction/compaction_picker.h +++ b/db/compaction/compaction_picker.h @@ -46,7 +46,7 @@ struct CompactionInputFiles; // compaction style specific logic for them. class CompactionPicker { public: - CompactionPicker(const ImmutableCFOptions& ioptions, + CompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp); virtual ~CompactionPicker(); @@ -218,7 +218,7 @@ class CompactionPicker { } protected: - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; // A helper function to SanitizeCompactionInputFiles() that // sanitizes "input_files" by adding necessary files. @@ -244,7 +244,7 @@ class CompactionPicker { // compaction. class NullCompactionPicker : public CompactionPicker { public: - NullCompactionPicker(const ImmutableCFOptions& ioptions, + NullCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual ~NullCompactionPicker() {} diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc index 59213aec9d3..4b4c09b80f0 100644 --- a/db/compaction/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -45,7 +45,7 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction( uint64_t total_size = GetTotalFilesSize(level_files); int64_t _current_time; - auto status = ioptions_.env->GetCurrentTime(&_current_time); + auto status = ioptions_.clock->GetCurrentTime(&_current_time); if (!status.ok()) { ROCKS_LOG_BUFFER(log_buffer, "[%s] FIFO compaction: Couldn't get current time: %s. 
" @@ -244,7 +244,7 @@ Compaction* FIFOCompactionPicker::CompactRange( assert(input_level == 0); assert(output_level == 0); *compaction_end = nullptr; - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger); Compaction* c = PickCompaction(cf_name, mutable_cf_options, mutable_db_options, vstorage, &log_buffer); log_buffer.FlushBufferToLog(); diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h index e8aec64c6ab..2a07f8df776 100644 --- a/db/compaction/compaction_picker_fifo.h +++ b/db/compaction/compaction_picker_fifo.h @@ -15,7 +15,7 @@ namespace ROCKSDB_NAMESPACE { class FIFOCompactionPicker : public CompactionPicker { public: - FIFOCompactionPicker(const ImmutableCFOptions& ioptions, + FIFOCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index 3778d6d8dec..08c48c8f0b5 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -49,7 +49,7 @@ class LevelCompactionBuilder { CompactionPicker* compaction_picker, LogBuffer* log_buffer, const MutableCFOptions& mutable_cf_options, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableDBOptions& mutable_db_options) : cf_name_(cf_name), vstorage_(vstorage), @@ -121,7 +121,7 @@ class LevelCompactionBuilder { CompactionReason compaction_reason_ = CompactionReason::kUnknown; const MutableCFOptions& mutable_cf_options_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const MutableDBOptions& mutable_db_options_; // Pick a path ID to place a newly generated file, with its level static uint32_t GetPathId(const ImmutableCFOptions& ioptions, diff --git a/db/compaction/compaction_picker_level.h b/db/compaction/compaction_picker_level.h index 460a5ef1705..42a9b60a632 100644 --- a/db/compaction/compaction_picker_level.h +++ b/db/compaction/compaction_picker_level.h @@ -17,7 +17,7 @@ namespace ROCKSDB_NAMESPACE { // for description of Leveled compaction. 
class LevelCompactionPicker : public CompactionPicker { public: - LevelCompactionPicker(const ImmutableCFOptions& ioptions, + LevelCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual Compaction* PickCompaction( diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index d5580cc8ad7..5d543048f5a 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -12,7 +12,6 @@ #include "db/compaction/compaction_picker_level.h" #include "db/compaction/compaction_picker_universal.h" -#include "logging/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" @@ -31,7 +30,7 @@ class CompactionPickerTest : public testing::Test { const Comparator* ucmp_; InternalKeyComparator icmp_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; MutableDBOptions mutable_db_options_; LevelCompactionPicker level_compaction_picker; @@ -142,7 +141,7 @@ class CompactionPickerTest : public testing::Test { if (temp_vstorage_) { VersionBuilder builder(FileOptions(), &ioptions_, nullptr, vstorage_.get(), nullptr); - builder.SaveTo(temp_vstorage_.get()); + ASSERT_OK(builder.SaveTo(temp_vstorage_.get())); vstorage_ = std::move(temp_vstorage_); } vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_); @@ -651,7 +650,7 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) { TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) { // The case where universal periodic compaction couldn't form - // a compaction that inlcudes any file marked for periodic compaction. + // a compaction that includes any file marked for periodic compaction. // Right now we form the compaction anyway if it is more than one // sorted run. Just put the case here to validate that it doesn't // crash. @@ -801,7 +800,7 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) { Add(2, 6U, "150", "175", 60000000U); // Overlaps with file 26, 27, total size 521M Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size - // 520M, the smalelst overlapping + // 520M, the smallest overlapping Add(2, 8U, "201", "300", 60000000U); // Overlaps with file 28, 29, total size 521M @@ -1229,7 +1228,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) { Add(0, 32U, "001", "400", 1000000000U, 0, 0); Add(0, 33U, "001", "400", 1000000000U, 0, 0); - // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1. + // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1. Add(1, 4U, "050", "300", 1000000000U, 0, 0); file_map_[4u].first->being_compacted = true; Add(1, 5U, "301", "350", 1000000000U, 0, 0); @@ -1262,7 +1261,7 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) { Add(0, 32U, "001", "400", 1000000000U, 0, 0); Add(0, 33U, "001", "400", 1000000000U, 0, 0); - // L1 total size 2GB, score 2.2. If one file being comapcted, score 1.1. + // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1. 
Add(1, 4U, "050", "300", 1000000000U, 0, 0); Add(1, 5U, "301", "350", 1000000000U, 0, 0); diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index 1e95191d66b..b6f38f8282f 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -32,7 +32,7 @@ namespace { class UniversalCompactionBuilder { public: UniversalCompactionBuilder( - const ImmutableCFOptions& ioptions, const InternalKeyComparator* icmp, + const ImmutableOptions& ioptions, const InternalKeyComparator* icmp, const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, UniversalCompactionPicker* picker, LogBuffer* log_buffer) @@ -108,7 +108,7 @@ class UniversalCompactionBuilder { // overlapping. bool IsInputFilesNonOverlapping(Compaction* c); - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const InternalKeyComparator* icmp_; double score_; std::vector sorted_runs_; @@ -486,7 +486,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { } #endif // update statistics - RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION, + RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, c->inputs(0)->size()); picker_->RegisterCompaction(c); @@ -733,7 +733,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns( } // Look at overall size amplification. If size amplification -// exceeeds the configured value, then do a compaction +// exceeds the configured value, then do a compaction // of the candidate files all the way upto the earliest // base file (overrides configured values of file-size ratios, // min_merge_width and max_merge_width). diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h index a06ad3b8562..5f897cc9b39 100644 --- a/db/compaction/compaction_picker_universal.h +++ b/db/compaction/compaction_picker_universal.h @@ -15,7 +15,7 @@ namespace ROCKSDB_NAMESPACE { class UniversalCompactionPicker : public CompactionPicker { public: - UniversalCompactionPicker(const ImmutableCFOptions& ioptions, + UniversalCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} virtual Compaction* PickCompaction( diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc new file mode 100644 index 00000000000..1b9afab8910 --- /dev/null +++ b/db/compaction/compaction_service_test.cc @@ -0,0 +1,458 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifndef ROCKSDB_LITE + +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class MyTestCompactionService : public CompactionService { + public: + MyTestCompactionService(const std::string& db_path, + std::shared_ptr fs, Options& options) + : db_path_(db_path), fs_(fs), options_(options) {} + + static const char* kClassName() { return "MyTestCompactionService"; } + + const char* Name() const override { return kClassName(); } + + CompactionServiceJobStatus Start(const std::string& compaction_service_input, + int job_id) override { + InstrumentedMutexLock l(&mutex_); + jobs_.emplace(job_id, compaction_service_input); + CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess; + TEST_SYNC_POINT_CALLBACK("MyTestCompactionService::Start::End", &s); + return s; + } + + CompactionServiceJobStatus WaitForComplete( + int job_id, std::string* compaction_service_result) override { + std::string compaction_input; + { + InstrumentedMutexLock l(&mutex_); + auto i = jobs_.find(job_id); + if (i == jobs_.end()) { + return CompactionServiceJobStatus::kFailure; + } + compaction_input = std::move(i->second); + jobs_.erase(i); + } + + CompactionServiceOptionsOverride options_override; + options_override.env = options_.env; + options_override.file_checksum_gen_factory = + options_.file_checksum_gen_factory; + options_override.comparator = options_.comparator; + options_override.merge_operator = options_.merge_operator; + options_override.compaction_filter = options_.compaction_filter; + options_override.compaction_filter_factory = + options_.compaction_filter_factory; + options_override.prefix_extractor = options_.prefix_extractor; + options_override.table_factory = options_.table_factory; + options_override.sst_partitioner_factory = options_.sst_partitioner_factory; + + Status s = DB::OpenAndCompact( + db_path_, db_path_ + "/" + ROCKSDB_NAMESPACE::ToString(job_id), + compaction_input, compaction_service_result, options_override); + TEST_SYNC_POINT_CALLBACK("MyTestCompactionService::WaitForComplete::End", + compaction_service_result); + compaction_num_.fetch_add(1); + if (s.ok()) { + return CompactionServiceJobStatus::kSuccess; + } else { + return CompactionServiceJobStatus::kFailure; + } + } + + int GetCompactionNum() { return compaction_num_.load(); } + + private: + InstrumentedMutex mutex_; + std::atomic_int compaction_num_{0}; + std::map jobs_; + const std::string db_path_; + std::shared_ptr fs_; + Options options_; +}; + +class CompactionServiceTest : public DBTestBase { + public: + explicit CompactionServiceTest() + : DBTestBase("compaction_service_test", true) {} + + protected: + void GenerateTestData() { + // Generate 20 files @ L2 + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + // Generate 10 files @ L1 overlap with all 20 files @ L2 + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + ASSERT_EQ(FilesPerLevel(), "0,10,20"); + } + + void VerifyTestData() { + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + } +}; + +TEST_F(CompactionServiceTest, BasicCompactions) { + Options options = 
CurrentOptions(); + options.env = env_; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + + DestroyAndReopen(options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + auto my_cs = + dynamic_cast(options.compaction_service.get()); + ASSERT_GE(my_cs->GetCompactionNum(), 1); + + // Test failed compaction + SyncPoint::GetInstance()->SetCallBack( + "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) { + // override job status + Status* s = static_cast(status); + *s = Status::Aborted("MyTestCompactionService failed to compact!"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s; + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + s = Put(Key(key_id), "value_new" + ToString(key_id)); + if (s.IsAborted()) { + break; + } + } + if (s.IsAborted()) { + break; + } + s = Flush(); + if (s.IsAborted()) { + break; + } + s = dbfull()->TEST_WaitForCompact(); + if (s.IsAborted()) { + break; + } + } + ASSERT_TRUE(s.IsAborted()); +} + +TEST_F(CompactionServiceTest, ManualCompaction) { + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + DestroyAndReopen(options); + GenerateTestData(); + + auto my_cs = + dynamic_cast(options.compaction_service.get()); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + start_str = Key(120); + start = start_str; + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + end_str = Key(92); + end = end_str; + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); +} + +TEST_F(CompactionServiceTest, FailedToStart) { + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + DestroyAndReopen(options); + GenerateTestData(); + + SyncPoint::GetInstance()->SetCallBack( + "MyTestCompactionService::Start::End", [&](void* status) { + // override job status + auto s = static_cast(status); + *s = CompactionServiceJobStatus::kFailure; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::string start_str = 
Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_TRUE(s.IsIncomplete()); +} + +TEST_F(CompactionServiceTest, InvalidResult) { + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + DestroyAndReopen(options); + GenerateTestData(); + + SyncPoint::GetInstance()->SetCallBack( + "MyTestCompactionService::WaitForComplete::End", [&](void* result) { + // override job status + auto result_str = static_cast(result); + *result_str = "Invalid Str"; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_FALSE(s.ok()); +} + +// TODO: support sub-compaction +TEST_F(CompactionServiceTest, DISABLED_SubCompaction) { + Options options = CurrentOptions(); + options.env = env_; + options.max_subcompactions = 10; + options.target_file_size_base = 1 << 10; // 1KB + options.disable_auto_compactions = true; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + + DestroyAndReopen(options); + GenerateTestData(); + + auto cro = CompactRangeOptions(); + cro.max_subcompactions = 10; + db_->CompactRange(cro, nullptr, nullptr); +} + +class PartialDeleteCompactionFilter : public CompactionFilter { + public: + CompactionFilter::Decision FilterV2( + int /*level*/, const Slice& key, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + int i = std::stoi(key.ToString().substr(3)); + if (i > 5 && i <= 105) { + return CompactionFilter::Decision::kRemove; + } + return CompactionFilter::Decision::kKeep; + } + + const char* Name() const override { return "PartialDeleteCompactionFilter"; } +}; + +TEST_F(CompactionServiceTest, CompactionFilter) { + Options options = CurrentOptions(); + options.env = env_; + auto delete_comp_filter = PartialDeleteCompactionFilter(); + options.compaction_filter = &delete_comp_filter; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + + DestroyAndReopen(options); + + for (int i = 0; i < 20; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 10 + j; + ASSERT_OK(Put(Key(key_id), "value" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + int key_id = i * 20 + j * 2; + ASSERT_OK(Put(Key(key_id), "value_new" + ToString(key_id))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i > 5 && i <= 105) { + ASSERT_EQ(result, "NOT_FOUND"); + } else if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + auto my_cs = + dynamic_cast(options.compaction_service.get()); + ASSERT_GE(my_cs->GetCompactionNum(), 1); +} + +TEST_F(CompactionServiceTest, Snapshot) { + Options options = CurrentOptions(); + options.env = env_; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + + DestroyAndReopen(options); + + ASSERT_OK(Put(Key(1), 
"value1")); + ASSERT_OK(Put(Key(2), "value1")); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(1), "value2")); + ASSERT_OK(Put(Key(3), "value2")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + auto my_cs = + dynamic_cast(options.compaction_service.get()); + ASSERT_GE(my_cs->GetCompactionNum(), 1); + ASSERT_EQ("value1", Get(Key(1), s1)); + ASSERT_EQ("value2", Get(Key(1))); + db_->ReleaseSnapshot(s1); +} + +TEST_F(CompactionServiceTest, ConcurrentCompaction) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 100; + options.env = env_; + options.compaction_service = std::make_shared( + dbname_, env_->GetFileSystem(), options); + options.max_background_jobs = 20; + + DestroyAndReopen(options); + GenerateTestData(); + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + + std::vector threads; + for (const auto& file : meta.levels[1].files) { + threads.push_back(std::thread([&]() { + std::string fname = file.db_path + "/" + file.name; + ASSERT_OK(db_->CompactFiles(CompactionOptions(), {fname}, 2)); + })); + } + + for (auto& thread : threads) { + thread.join(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // verify result + for (int i = 0; i < 200; i++) { + auto result = Get(Key(i)); + if (i % 2) { + ASSERT_EQ(result, "value" + ToString(i)); + } else { + ASSERT_EQ(result, "value_new" + ToString(i)); + } + } + auto my_cs = + dynamic_cast(options.compaction_service.get()); + ASSERT_EQ(my_cs->GetCompactionNum(), 10); + ASSERT_EQ(FilesPerLevel(), "0,0,10"); +} + +} // namespace ROCKSDB_NAMESPACE + +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index 4eac91e2a57..876cf07fae7 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -17,7 +17,6 @@ #include "util/string_util.h" #include "utilities/merge_operators.h" -using std::unique_ptr; namespace ROCKSDB_NAMESPACE { namespace { diff --git a/db/convenience.cc b/db/convenience.cc index 96735d7e5f5..5af6515c8c4 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -44,7 +44,7 @@ Status VerifySstFileChecksum(const Options& options, std::unique_ptr file; uint64_t file_size; InternalKeyComparator internal_comparator(options.comparator); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); Status s = ioptions.fs->NewRandomAccessFile(file_path, FileOptions(env_options), diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 964ff11700c..cabf7e700e5 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -9,7 +9,6 @@ #ifndef ROCKSDB_LITE -#include #include #include #include @@ -20,8 +19,8 @@ #include "db/db_test_util.h" #include "db/log_format.h" #include "db/version_set.h" -#include "env/composite_env_wrapper.h" #include "file/filename.h" +#include "port/stack_trace.h" #include 
"rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" @@ -33,6 +32,7 @@ #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/cast_util.h" #include "util/random.h" #include "util/string_util.h" @@ -42,7 +42,8 @@ static constexpr int kValueSize = 1000; class CorruptionTest : public testing::Test { public: - test::ErrorEnv env_; + std::shared_ptr env_guard_; + test::ErrorEnv* env_; std::string dbname_; std::shared_ptr tiny_cache_; Options options_; @@ -53,9 +54,14 @@ class CorruptionTest : public testing::Test { // set it to 0), test SequenceNumberRecovery will fail, likely because of a // bug in recovery code. Keep it 4 for now to make the test passes. tiny_cache_ = NewLRUCache(100, 4); + Env* base_env = Env::Default(); + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); + EXPECT_NE(base_env, nullptr); + env_ = new test::ErrorEnv(base_env); options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; - options_.env = &env_; - dbname_ = test::PerThreadDBPath("corruption_test"); + options_.env = env_; + dbname_ = test::PerThreadDBPath(env_, "corruption_test"); Status s = DestroyDB(dbname_, options_); EXPECT_OK(s); @@ -77,8 +83,11 @@ class CorruptionTest : public testing::Test { if (getenv("KEEP_DB")) { fprintf(stdout, "db is still at %s\n", dbname_.c_str()); } else { - EXPECT_OK(DestroyDB(dbname_, Options())); + Options opts; + opts.env = env_->target(); + EXPECT_OK(DestroyDB(dbname_, opts)); } + delete env_; } void CloseDb() { @@ -93,7 +102,7 @@ class CorruptionTest : public testing::Test { if (opt.env == Options().env) { // If env is not overridden, replace it with ErrorEnv. // Otherwise, the test already uses a non-default Env. - opt.env = &env_; + opt.env = env_; } opt.arena_block_size = 4096; BlockBasedTableOptions table_options; @@ -124,7 +133,7 @@ class CorruptionTest : public testing::Test { //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); Slice key = Key(i + start, &key_space); batch.Clear(); - ASSERT_OK(batch.Put(key, Value(i, &value_space))); + ASSERT_OK(batch.Put(key, Value(i + start, &value_space))); ASSERT_OK(db_->Write(WriteOptions(), &batch)); } } @@ -176,7 +185,7 @@ class CorruptionTest : public testing::Test { void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { // Pick file to corrupt std::vector filenames; - ASSERT_OK(env_.GetChildren(dbname_, &filenames)); + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); uint64_t number; FileType type; std::string fname; @@ -191,7 +200,7 @@ class CorruptionTest : public testing::Test { } ASSERT_TRUE(!fname.empty()) << filetype; - test::CorruptFile(fname, offset, bytes_to_corrupt); + ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt)); } // corrupts exactly one file at level `level`. if no file found at level, @@ -201,7 +210,8 @@ class CorruptionTest : public testing::Test { db_->GetLiveFilesMetaData(&metadata); for (const auto& m : metadata) { if (m.level == level) { - test::CorruptFile(dbname_ + "/" + m.name, offset, bytes_to_corrupt); + ASSERT_OK(test::CorruptFile(env_, dbname_ + "/" + m.name, offset, + bytes_to_corrupt)); return; } } @@ -256,8 +266,8 @@ TEST_F(CorruptionTest, Recovery) { // is not available for WAL though. 
CloseDb(); #endif - Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record - Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block + Corrupt(kWalFile, 19, 1); // WriteBatch tag for first record + Corrupt(kWalFile, log::kBlockSize + 1000, 1); // Somewhere in second block ASSERT_TRUE(!TryReopen().ok()); options_.paranoid_checks = false; Reopen(&options_); @@ -267,14 +277,14 @@ TEST_F(CorruptionTest, Recovery) { } TEST_F(CorruptionTest, RecoverWriteError) { - env_.writable_file_error_ = true; + env_->writable_file_error_ = true; Status s = TryReopen(); ASSERT_TRUE(!s.ok()); } TEST_F(CorruptionTest, NewFileErrorDuringWrite) { // Do enough writing to force minor compaction - env_.writable_file_error_ = true; + env_->writable_file_error_ = true; const int num = static_cast(3 + (Options().write_buffer_size / kValueSize)); std::string value_storage; @@ -290,8 +300,8 @@ TEST_F(CorruptionTest, NewFileErrorDuringWrite) { ASSERT_TRUE(!failed || !s.ok()); } ASSERT_TRUE(!s.ok()); - ASSERT_GE(env_.num_writable_file_errors_, 1); - env_.writable_file_error_ = false; + ASSERT_GE(env_->num_writable_file_errors_, 1); + env_->writable_file_error_ = false; Reopen(); } @@ -309,7 +319,7 @@ TEST_F(CorruptionTest, TableFile) { TEST_F(CorruptionTest, VerifyChecksumReadahead) { Options options; - SpecialEnv senv(Env::Default()); + SpecialEnv senv(env_->target()); options.env = &senv; // Disable block cache as we are going to check checksum for // the same file twice and measure number of reads. @@ -431,6 +441,7 @@ TEST_F(CorruptionTest, CorruptedDescriptor) { TEST_F(CorruptionTest, CompactionInputError) { Options options; + options.env = env_; Reopen(&options); Build(10); DBImpl* dbi = static_cast_with_check(db_); @@ -451,6 +462,7 @@ TEST_F(CorruptionTest, CompactionInputError) { TEST_F(CorruptionTest, CompactionInputErrorParanoid) { Options options; + options.env = env_; options.paranoid_checks = true; options.write_buffer_size = 131072; options.max_write_buffer_number = 2; @@ -521,28 +533,31 @@ TEST_F(CorruptionTest, RangeDeletionCorrupted) { ASSERT_EQ(static_cast(1), metadata.size()); std::string filename = dbname_ + metadata[0].name; - std::unique_ptr file; - ASSERT_OK(options_.env->NewRandomAccessFile(filename, &file, EnvOptions())); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file), - filename)); + FileOptions file_opts; + const auto& fs = options_.env->GetFileSystem(); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, filename, file_opts, + &file_reader, nullptr)); uint64_t file_size; - ASSERT_OK(options_.env->GetFileSize(filename, &file_size)); + ASSERT_OK( + fs->GetFileSize(filename, file_opts.io_options, &file_size, nullptr)); BlockHandle range_del_handle; ASSERT_OK(FindMetaBlock( file_reader.get(), file_size, kBlockBasedTableMagicNumber, - ImmutableCFOptions(options_), kRangeDelBlock, &range_del_handle)); + ImmutableOptions(options_), kRangeDelBlock, &range_del_handle)); ASSERT_OK(TryReopen()); - test::CorruptFile(filename, static_cast(range_del_handle.offset()), 1); + ASSERT_OK(test::CorruptFile(env_, filename, + static_cast(range_del_handle.offset()), 1)); ASSERT_TRUE(TryReopen().IsCorruption()); } TEST_F(CorruptionTest, FileSystemStateCorrupted) { for (int iter = 0; iter < 2; ++iter) { Options options; + options.env = env_; options.paranoid_checks = true; options.create_if_missing = true; Reopen(&options); @@ -551,7 +566,7 @@ TEST_F(CorruptionTest, FileSystemStateCorrupted) { DBImpl* dbi = 
static_cast_with_check(db_); std::vector metadata; dbi->GetLiveFilesMetaData(&metadata); - ASSERT_GT(metadata.size(), size_t(0)); + ASSERT_GT(metadata.size(), 0); std::string filename = dbname_ + metadata[0].name; delete db_; @@ -559,15 +574,15 @@ TEST_F(CorruptionTest, FileSystemStateCorrupted) { if (iter == 0) { // corrupt file size std::unique_ptr file; - env_.NewWritableFile(filename, &file, EnvOptions()); + ASSERT_OK(env_->NewWritableFile(filename, &file, EnvOptions())); ASSERT_OK(file->Append(Slice("corrupted sst"))); file.reset(); Status x = TryReopen(&options); ASSERT_TRUE(x.IsCorruption()); } else { // delete the file - ASSERT_OK(env_.DeleteFile(filename)); + ASSERT_OK(env_->DeleteFile(filename)); Status x = TryReopen(&options); - ASSERT_TRUE(x.IsPathNotFound()); + ASSERT_TRUE(x.IsCorruption()); } ASSERT_OK(DestroyDB(dbname_, options_)); @@ -581,6 +596,7 @@ static const auto& corruption_modes = { TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { Options options; + options.env = env_; options.check_flush_compaction_key_order = false; options.paranoid_file_checks = true; options.create_if_missing = true; @@ -595,7 +611,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { options.table_factory = mock; mock->SetCorruptionMode(mode); ASSERT_OK(DB::Open(options, dbname_, &db_)); - assert(db_ != nullptr); + assert(db_ != nullptr); // suppress false clang-analyze report Build(10); s = db_->Flush(FlushOptions()); if (mode == mock::MockTableFactory::kCorruptNone) { @@ -608,6 +624,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { Options options; + options.env = env_; options.paranoid_file_checks = true; options.create_if_missing = true; options.check_flush_compaction_key_order = false; @@ -620,7 +637,7 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { std::make_shared(); options.table_factory = mock; ASSERT_OK(DB::Open(options, dbname_, &db_)); - assert(db_ != nullptr); + assert(db_ != nullptr); // suppress false clang-analyze report Build(100, 2); // ASSERT_OK(db_->Flush(FlushOptions())); DBImpl* dbi = static_cast_with_check(db_); @@ -635,8 +652,111 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { } } +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + std::string start, end; + assert(db_ != nullptr); // suppress false clang-analyze report + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(3, &start), Key(7, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(8, &start), Key(9, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(5, &end))); + Build(10); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); + } + db_->ReleaseSnapshot(snap); + } +} + +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + 
options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + Build(10, 0, 0); + std::string start, end; + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(5, &start), Key(15, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(8, &start), Key(9, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(12, &start), Key(17, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(4, &end))); + Build(10, 10, 0); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); + } + db_->ReleaseSnapshot(snap); + } +} + +TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) { + Options options; + options.env = env_; + options.check_flush_compaction_key_order = false; + options.paranoid_file_checks = true; + options.create_if_missing = true; + for (bool do_flush : {true, false}) { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + assert(db_ != nullptr); // suppress false clang-analyze report + std::string start, end; + Build(10); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(3, &start), Key(7, &end))); + auto snap = db_->GetSnapshot(); + ASSERT_NE(snap, nullptr); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(6, &start), Key(8, &end))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(2, &start), Key(5, &end))); + if (do_flush) { + ASSERT_OK(db_->Flush(FlushOptions())); + } else { + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); + } + db_->ReleaseSnapshot(snap); + } +} + TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { Options options; + options.env = env_; options.create_if_missing = true; options.allow_data_in_errors = true; auto mode = mock::MockTableFactory::kCorruptKey; @@ -650,7 +770,7 @@ TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { options.table_factory = mock; ASSERT_OK(DB::Open(options, dbname_, &db_)); - assert(db_ != nullptr); + assert(db_ != nullptr); // suppress false clang-analyze report Build(100, 2); DBImpl* dbi = static_cast_with_check(db_); @@ -662,6 +782,7 @@ TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { TEST_F(CorruptionTest, CompactionKeyOrderCheck) { Options options; + options.env = env_; options.paranoid_file_checks = false; options.create_if_missing = true; options.check_flush_compaction_key_order = false; @@ -672,7 +793,7 @@ TEST_F(CorruptionTest, CompactionKeyOrderCheck) { std::make_shared(); options.table_factory = mock; ASSERT_OK(DB::Open(options, dbname_, &db_)); - assert(db_ != nullptr); + assert(db_ != nullptr); // suppress false clang-analyze report mock->SetCorruptionMode(mock::MockTableFactory::kCorruptReorderKey); Build(100, 2); DBImpl* dbi = static_cast_with_check(db_); @@ -685,6 +806,7 @@ TEST_F(CorruptionTest, 
CompactionKeyOrderCheck) { TEST_F(CorruptionTest, FlushKeyOrderCheck) { Options options; + options.env = env_; options.paranoid_file_checks = false; options.create_if_missing = true; ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}})); @@ -713,7 +835,6 @@ TEST_F(CorruptionTest, FlushKeyOrderCheck) { } TEST_F(CorruptionTest, DisableKeyOrderCheck) { - Options options; ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "false"}})); DBImpl* dbi = static_cast_with_check(db_); @@ -732,106 +853,10 @@ TEST_F(CorruptionTest, DisableKeyOrderCheck) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } -TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) { - Options options; - options.paranoid_file_checks = true; - options.create_if_missing = true; - for (bool do_flush : {true, false}) { - delete db_; - db_ = nullptr; - ASSERT_OK(DestroyDB(dbname_, options)); - ASSERT_OK(DB::Open(options, dbname_, &db_)); - std::string start, end; - assert(db_ != nullptr); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(3, &start), Key(7, &end))); - auto snap = db_->GetSnapshot(); - ASSERT_NE(snap, nullptr); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(8, &start), Key(9, &end))); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(2, &start), Key(5, &end))); - Build(10); - if (do_flush) { - ASSERT_OK(db_->Flush(FlushOptions())); - } else { - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); - } - db_->ReleaseSnapshot(snap); - } -} - -TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) { - Options options; - options.paranoid_file_checks = true; - options.create_if_missing = true; - for (bool do_flush : {true, false}) { - delete db_; - db_ = nullptr; - ASSERT_OK(DestroyDB(dbname_, options)); - ASSERT_OK(DB::Open(options, dbname_, &db_)); - assert(db_ != nullptr); - Build(10, 0, 0); - std::string start, end; - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(5, &start), Key(15, &end))); - auto snap = db_->GetSnapshot(); - ASSERT_NE(snap, nullptr); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(8, &start), Key(9, &end))); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(12, &start), Key(17, &end))); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(2, &start), Key(4, &end))); - Build(10, 10, 0); - if (do_flush) { - ASSERT_OK(db_->Flush(FlushOptions())); - } else { - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); - } - db_->ReleaseSnapshot(snap); - } -} - -TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) { - Options options; - options.paranoid_file_checks = true; - options.create_if_missing = true; - for (bool do_flush : {true, false}) { - delete db_; - db_ = nullptr; - ASSERT_OK(DestroyDB(dbname_, options)); - ASSERT_OK(DB::Open(options, dbname_, &db_)); - assert(db_ != nullptr); - std::string start, end; - Build(10); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(3, &start), Key(7, &end))); - auto snap = db_->GetSnapshot(); - ASSERT_NE(snap, nullptr); - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(6, &start), Key(8, &end))); - 
ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(2, &start), Key(5, &end))); - if (do_flush) { - ASSERT_OK(db_->Flush(FlushOptions())); - } else { - DBImpl* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true)); - } - db_->ReleaseSnapshot(snap); - } -} - TEST_F(CorruptionTest, VerifyWholeTableChecksum) { CloseDb(); Options options; - options.env = &env_; + options.env = env_; ASSERT_OK(DestroyDB(dbname_, options)); options.create_if_missing = true; options.file_checksum_gen_factory = @@ -840,50 +865,43 @@ TEST_F(CorruptionTest, VerifyWholeTableChecksum) { Build(10, 5); - auto* dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->VerifyFileChecksums(ReadOptions())); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); CloseDb(); // Corrupt the first byte of each table file, this must be data block. Corrupt(kTableFile, 0, 1); ASSERT_OK(TryReopen(&options)); - dbi = static_cast_with_check(db_); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); int count{0}; SyncPoint::GetInstance()->SetCallBack( - "DBImpl::VerifySstFileChecksum:mismatch", [&](void* arg) { + "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) { auto* s = reinterpret_cast(arg); - assert(s); + ASSERT_NE(s, nullptr); ++count; ASSERT_NOK(*s); }); SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_TRUE(dbi->VerifyFileChecksums(ReadOptions()).IsCorruption()); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption()); ASSERT_EQ(1, count); - - CloseDb(); - ASSERT_OK(DestroyDB(dbname_, options)); - Reopen(&options); - Build(10, 5); - dbi = static_cast_with_check(db_); - ASSERT_OK(dbi->VerifyFileChecksums(ReadOptions())); - CloseDb(); - Corrupt(kTableFile, 0, 1); - - // Set best_efforts_recovery to true - options.best_efforts_recovery = true; -#ifdef OS_LINUX - ASSERT_TRUE(TryReopen(&options).IsCorruption()); -#endif // OS_LINUX } } // namespace ROCKSDB_NAMESPACE +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 2aaf2c50ded..9b76c03d5ca 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -63,6 +63,15 @@ class CuckooTableDBTest : public testing::Test { ASSERT_OK(DB::Open(opts, dbname_, &db_)); } + void DestroyAndReopen(Options* options) { + assert(options); + ASSERT_OK(db_->Close()); + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, *options)); + Reopen(options); + } + Status Put(const Slice& k, const Slice& v) { return db_->Put(WriteOptions(), k, v); } @@ -120,10 +129,10 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(Put("key1", "v1")); ASSERT_OK(Put("key2", "v2")); ASSERT_OK(Put("key3", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(3U, ptc.begin()->second->num_entries); ASSERT_EQ("1", 
FilesPerLevel()); @@ -137,9 +146,9 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(Put("key4", "v4")); ASSERT_OK(Put("key5", "v5")); ASSERT_OK(Put("key6", "v6")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(2U, ptc.size()); auto row = ptc.begin(); ASSERT_EQ(3U, row->second->num_entries); @@ -155,8 +164,8 @@ TEST_F(CuckooTableDBTest, Flush) { ASSERT_OK(Delete("key6")); ASSERT_OK(Delete("key5")); ASSERT_OK(Delete("key4")); - dbfull()->TEST_FlushMemTable(); - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(3U, ptc.size()); row = ptc.begin(); ASSERT_EQ(3U, row->second->num_entries); @@ -177,10 +186,10 @@ TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) { ASSERT_OK(Put("key1", "v1")); ASSERT_OK(Put("key2", "v2")); ASSERT_OK(Put("key1", "v3")); // Duplicate - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK(reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(1U, ptc.size()); ASSERT_EQ(2U, ptc.begin()->second->num_entries); ASSERT_EQ("1", FilesPerLevel()); @@ -205,12 +214,12 @@ static std::string Uint64Key(uint64_t i) { TEST_F(CuckooTableDBTest, Uint64Comparator) { Options options = CurrentOptions(); options.comparator = test::Uint64Comparator(); - Reopen(&options); + DestroyAndReopen(&options); ASSERT_OK(Put(Uint64Key(1), "v1")); ASSERT_OK(Put(Uint64Key(2), "v2")); ASSERT_OK(Put(Uint64Key(3), "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get(Uint64Key(1))); ASSERT_EQ("v2", Get(Uint64Key(2))); @@ -219,10 +228,10 @@ TEST_F(CuckooTableDBTest, Uint64Comparator) { // Add more keys. ASSERT_OK(Delete(Uint64Key(2))); // Delete. - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_OK(Put(Uint64Key(3), "v0")); // Update. 
ASSERT_OK(Put(Uint64Key(4), "v4")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get(Uint64Key(1))); ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2))); ASSERT_EQ("v0", Get(Uint64Key(3))); @@ -242,11 +251,11 @@ TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) { for (int idx = 0; idx < 28; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx)))); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("1", FilesPerLevel()); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow trivial move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow trivial move */)); ASSERT_EQ("0,2", FilesPerLevel()); for (int idx = 0; idx < 28; ++idx) { ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx))); @@ -265,15 +274,15 @@ TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) { for (int idx = 0; idx < 11; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a'))); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("1", FilesPerLevel()); // Generate one more file in level-0, and should trigger level-0 compaction for (int idx = 0; idx < 11; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx)))); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_EQ("0,1", FilesPerLevel()); for (int idx = 0; idx < 11; ++idx) { @@ -294,7 +303,7 @@ TEST_F(CuckooTableDBTest, AdaptiveTable) { ASSERT_OK(Put("key1", "v1")); ASSERT_OK(Put("key2", "v2")); ASSERT_OK(Put("key3", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Write some keys using plain table. std::shared_ptr block_based_factory( @@ -310,7 +319,7 @@ TEST_F(CuckooTableDBTest, AdaptiveTable) { Reopen(&options); ASSERT_OK(Put("key4", "v4")); ASSERT_OK(Put("key1", "v5")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Write some keys using block based table. 
options.table_factory.reset(NewAdaptiveTableFactory( @@ -319,7 +328,7 @@ TEST_F(CuckooTableDBTest, AdaptiveTable) { Reopen(&options); ASSERT_OK(Put("key5", "v6")); ASSERT_OK(Put("key2", "v7")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v5", Get("key1")); ASSERT_EQ("v7", Get("key2")); diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 21723c6a1be..34f27b80924 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -12,6 +12,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "rocksdb/flush_block_policy.h" #include "rocksdb/merge_operator.h" #include "rocksdb/perf_context.h" #include "rocksdb/utilities/debug.h" @@ -20,6 +21,7 @@ #if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" #endif +#include "util/file_checksum_helper.h" #include "util/random.h" #include "utilities/fault_injection_env.h" #include "utilities/merge_operators.h" @@ -37,7 +39,10 @@ TEST_F(DBBasicTest, OpenWhenOpen) { options.env = env_; DB* db2 = nullptr; Status s = DB::Open(options, dbname_, &db2); - ASSERT_NOK(s); + ASSERT_NOK(s) << [db2]() { + delete db2; + return "db2 open: ok"; + }(); ASSERT_EQ(Status::Code::kIOError, s.code()); ASSERT_EQ(Status::SubCode::kNone, s.subcode()); ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr); @@ -142,7 +147,7 @@ TEST_F(DBBasicTest, ReadOnlyDB) { // Reopen and flush memtable. Reopen(options); - Flush(); + ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); @@ -178,7 +183,7 @@ TEST_F(DBBasicTest, ReadOnlyDBWithWriteDBIdToManifestSet) { // Reopen and flush memtable. Reopen(options); - Flush(); + ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); @@ -201,7 +206,7 @@ TEST_F(DBBasicTest, CompactedDB) { Reopen(options); // 1 L0 file, use CompactedDB if max_open_files = -1 ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1'))); - Flush(); + ASSERT_OK(Flush()); Close(); ASSERT_OK(ReadOnlyReopen(options)); Status s = Put("new", "value"); @@ -219,12 +224,12 @@ TEST_F(DBBasicTest, CompactedDB) { Reopen(options); // Add more L0 files ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2'))); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a'))); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b'))); ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e'))); - Flush(); + ASSERT_OK(Flush()); Close(); ASSERT_OK(ReadOnlyReopen(options)); @@ -401,16 +406,18 @@ TEST_F(DBBasicTest, GetSnapshot) { TEST_F(DBBasicTest, CheckLock) { do { - DB* localdb; + DB* localdb = nullptr; Options options = CurrentOptions(); ASSERT_OK(TryReopen(options)); // second open should fail Status s = DB::Open(options, dbname_, &localdb); - ASSERT_NOK(s); + ASSERT_NOK(s) << [localdb]() { + delete localdb; + return "localdb open: ok"; + }(); #ifdef OS_LINUX - ASSERT_TRUE(s.ToString().find("lock hold by current process") != - std::string::npos); + ASSERT_TRUE(s.ToString().find("lock ") != std::string::npos); #endif // OS_LINUX } while (ChangeCompactOptions()); } @@ -659,7 +666,7 @@ TEST_F(DBBasicTest, Snapshot) { ASSERT_EQ("0v4", Get(0, "foo")); ASSERT_EQ("1v4", Get(1, "foo")); ASSERT_EQ(1U, GetNumSnapshots()); - ASSERT_LE(time_snap1, GetTimeOldestSnapshots()); + ASSERT_LT(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber()); db_->ReleaseSnapshot(s2); @@ -1407,10 +1414,10 @@ TEST_F(DBBasicTest, MultiGetBatchedSortedMultiFile) 
{ // mix with memtable ASSERT_OK(Put(1, "k1", "v1")); ASSERT_OK(Put(1, "k2", "v2")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k3", "v3")); ASSERT_OK(Put(1, "k4", "v4")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Delete(1, "k4")); ASSERT_OK(Put(1, "k5", "v5")); ASSERT_OK(Delete(1, "no_key")); @@ -1453,19 +1460,19 @@ TEST_F(DBBasicTest, MultiGetBatchedDuplicateKeys) { // mix with memtable ASSERT_OK(Merge(1, "k1", "v1")); ASSERT_OK(Merge(1, "k2", "v2")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); ASSERT_OK(Merge(1, "k3", "v3")); ASSERT_OK(Merge(1, "k4", "v4")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); ASSERT_OK(Merge(1, "k4", "v4_2")); ASSERT_OK(Merge(1, "k6", "v6")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); ASSERT_OK(Merge(1, "k7", "v7")); ASSERT_OK(Merge(1, "k8", "v8")); - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); get_perf_context()->Reset(); @@ -1505,12 +1512,12 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(2); @@ -1519,12 +1526,12 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); @@ -1533,12 +1540,12 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); @@ -1584,12 +1591,12 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevelMerge) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(2); @@ -1598,12 +1605,12 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevelMerge) { ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); @@ -1612,12 +1619,12 @@ TEST_F(DBBasicTest, MultiGetBatchedMultiLevelMerge) { ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); @@ -1699,7 +1706,7 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSize) { ASSERT_OK(Put(1, "k7", "v7_")); ASSERT_OK(Put(1, "k3", "v3_")); ASSERT_OK(Put(1, "k4", "v4")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Delete(1, "k4")); ASSERT_OK(Put(1, "k11", "v11")); ASSERT_OK(Delete(1, "no_key")); @@ -1709,7 +1716,7 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSize) { ASSERT_OK(Put(1, "k15", "v15")); ASSERT_OK(Put(1, "k16", "v16")); ASSERT_OK(Put(1, "k17", "v17")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k1", "v1_")); ASSERT_OK(Put(1, "k2", "v2_")); @@ -1779,12 +1786,12 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSizeMultiLevelMerge) { 
ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(2); @@ -1793,12 +1800,12 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSizeMultiLevelMerge) { ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); @@ -1807,12 +1814,12 @@ TEST_F(DBBasicTest, MultiGetBatchedValueSizeMultiLevelMerge) { ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { - Flush(); + ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); @@ -1875,6 +1882,7 @@ TEST_F(DBBasicTest, MultiGetStats) { Options options; options.create_if_missing = true; options.disable_auto_compactions = true; + options.env = env_; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.block_size = 1; @@ -1884,7 +1892,7 @@ TEST_F(DBBasicTest, MultiGetStats) { table_options.no_block_cache = true; table_options.cache_index_and_filter_blocks = false; table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); int total_keys = 2000; @@ -1901,10 +1909,10 @@ TEST_F(DBBasicTest, MultiGetStats) { keys[i] = Slice(keys_str[i]); ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); if (i % 100 == 0) { - Flush(1); + ASSERT_OK(Flush(1)); } } - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); for (int i = 501; i < 1000; ++i) { @@ -1912,11 +1920,11 @@ TEST_F(DBBasicTest, MultiGetStats) { keys[i] = Slice(keys_str[i]); ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); if (i % 100 == 0) { - Flush(1); + ASSERT_OK(Flush(1)); } } - Flush(1); + ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); for (int i = 1001; i < total_keys; ++i) { @@ -1924,10 +1932,10 @@ TEST_F(DBBasicTest, MultiGetStats) { keys[i] = Slice(keys_str[i]); ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); if (i % 100 == 0) { - Flush(1); + ASSERT_OK(Flush(1)); } } - Flush(1); + ASSERT_OK(Flush(1)); Close(); ReopenWithColumnFamilies({"default", "pikachu"}, options); @@ -1954,7 +1962,7 @@ TEST_F(DBBasicTest, MultiGetStats) { ASSERT_GT(hist_sst.max, 0); // Minimun number of blocks read in a level. - ASSERT_EQ(hist_data_blocks.min, 0); + ASSERT_EQ(hist_data_blocks.min, 3); ASSERT_GT(hist_index_and_filter_blocks.min, 0); // Minimun number of sst files read in a level. 
ASSERT_GT(hist_sst.max, 0); @@ -2035,11 +2043,11 @@ TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) { ASSERT_OK(Put(1, "k2", "v2")); ASSERT_OK(Put(1, "k3", "v3")); ASSERT_OK(Put(1, "k4", "v4")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k5", "v5")); const Snapshot* snap1 = dbfull()->GetSnapshot(); ASSERT_OK(Delete(1, "k4")); - Flush(1); + ASSERT_OK(Flush(1)); const Snapshot* snap2 = dbfull()->GetSnapshot(); get_perf_context()->Reset(); @@ -2168,7 +2176,7 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { table_options.block_size = 16 * 1024; ASSERT_TRUE(table_options.block_size > BlockBasedTable::kMultiGetReadStackBufSize); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); std::string zero_str(128, '\0'); @@ -2178,7 +2186,7 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { std::string value(rnd.RandomString(128) + zero_str); assert(Put(Key(i), value) == Status::OK()); } - Flush(); + ASSERT_OK(Flush()); std::vector key_data(10); std::vector keys; @@ -2201,8 +2209,6 @@ TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) { Options options = CurrentOptions(); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu", "eevee"}, options); size_t num_cfs = handles_.size(); @@ -2241,8 +2247,6 @@ TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) { TEST_F(DBBasicTest, BestEffortsRecoveryWithVersionBuildingFailure) { Options options = CurrentOptions(); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); DestroyAndReopen(options); ASSERT_OK(Put("foo", "value")); ASSERT_OK(Flush()); @@ -2282,6 +2286,43 @@ class TableFileListener : public EventListener { }; } // namespace +TEST_F(DBBasicTest, LastSstFileNotInManifest) { + // If the last sst file is not tracked in MANIFEST, + // or the VersionEdit for the last sst file is not synced, + // on recovery, the last sst file should be deleted, + // and new sst files shouldn't reuse its file number. + Options options = CurrentOptions(); + DestroyAndReopen(options); + Close(); + + // Manually add a sst file. + constexpr uint64_t kSstFileNumber = 100; + const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber); + ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content", + /* fname = */ kSstFile, + /* should_sync = */ true)); + ASSERT_OK(env_->FileExists(kSstFile)); + + TableFileListener* listener = new TableFileListener(); + options.listeners.emplace_back(listener); + Reopen(options); + // kSstFile should already be deleted. + ASSERT_TRUE(env_->FileExists(kSstFile).IsNotFound()); + + ASSERT_OK(Put("k", "v")); + ASSERT_OK(Flush()); + // New sst file should have file number > kSstFileNumber. + std::vector& files = + listener->GetFiles(kDefaultColumnFamilyName); + ASSERT_EQ(files.size(), 1); + const std::string fname = files[0].erase(0, (dbname_ + "/").size()); + uint64_t number = 0; + FileType type = kTableFile; + ASSERT_TRUE(ParseFileName(fname, &number, &type)); + ASSERT_EQ(type, kTableFile); + ASSERT_GT(number, kSstFileNumber); +} + TEST_F(DBBasicTest, RecoverWithMissingFiles) { Options options = CurrentOptions(); DestroyAndReopen(options); @@ -2289,8 +2330,6 @@ TEST_F(DBBasicTest, RecoverWithMissingFiles) { // Disable auto compaction to simplify SST file name tracking. 
options.disable_auto_compactions = true; options.listeners.emplace_back(listener); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); CreateAndReopenWithCF({"pikachu", "eevee"}, options); std::vector all_cf_names = {kDefaultColumnFamilyName, "pikachu", "eevee"}; @@ -2351,8 +2390,6 @@ TEST_F(DBBasicTest, RecoverWithMissingFiles) { TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) { Options options = CurrentOptions(); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); options.env = env_; DestroyAndReopen(options); ASSERT_OK(Put("foo", "value0")); @@ -2379,8 +2416,6 @@ TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) { TEST_F(DBBasicTest, RecoverWithNoCurrentFile) { Options options = CurrentOptions(); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); options.env = env_; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); @@ -2404,8 +2439,6 @@ TEST_F(DBBasicTest, RecoverWithNoCurrentFile) { TEST_F(DBBasicTest, RecoverWithNoManifest) { Options options = CurrentOptions(); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); options.env = env_; DestroyAndReopen(options); ASSERT_OK(Put("foo", "value")); @@ -2417,7 +2450,7 @@ TEST_F(DBBasicTest, RecoverWithNoManifest) { ASSERT_OK(env_->GetChildren(dbname_, &files)); for (const auto& file : files) { uint64_t number = 0; - FileType type = kLogFile; + FileType type = kWalFile; if (ParseFileName(file, &number, &type) && type == kDescriptorFile) { ASSERT_OK(env_->DeleteFile(dbname_ + "/" + file)); } @@ -2435,8 +2468,6 @@ TEST_F(DBBasicTest, RecoverWithNoManifest) { TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) { Options options = CurrentOptions(); - options.file_checksum_gen_factory = - ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory(); DestroyAndReopen(options); TableFileListener* listener = new TableFileListener(); options.listeners.emplace_back(listener); @@ -2475,6 +2506,42 @@ TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) { ASSERT_FALSE(iter->Valid()); ASSERT_OK(iter->status()); } + +TEST_F(DBBasicTest, DisableTrackWal) { + // If WAL tracking was enabled, and then disabled during reopen, + // the previously tracked WALs should be removed from MANIFEST. + + Options options = CurrentOptions(); + options.track_and_verify_wals_in_manifest = true; + // extremely small write buffer size, + // so that new WALs are created more frequently. + options.write_buffer_size = 100; + options.env = env_; + DestroyAndReopen(options); + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put("foo" + std::to_string(i), "value" + std::to_string(i))); + } + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(db_->SyncWAL()); + // Some WALs are tracked. + ASSERT_FALSE(dbfull()->TEST_GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); + + // Disable WAL tracking. + options.track_and_verify_wals_in_manifest = false; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + // Previously tracked WALs are cleared. + ASSERT_TRUE(dbfull()->TEST_GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); + + // Re-enable WAL tracking again. 
+ options.track_and_verify_wals_in_manifest = true; + options.create_if_missing = false; + ASSERT_OK(TryReopen(options)); + ASSERT_TRUE(dbfull()->TEST_GetVersionSet()->GetWalSet().GetWals().empty()); + Close(); +} #endif // !ROCKSDB_LITE TEST_F(DBBasicTest, ManifestChecksumMismatch) { @@ -2504,6 +2571,64 @@ TEST_F(DBBasicTest, ManifestChecksumMismatch) { ASSERT_TRUE(s.IsCorruption()); } +#ifndef ROCKSDB_LITE +class DBBasicTestTrackWal : public DBTestBase, + public testing::WithParamInterface { + public: + DBBasicTestTrackWal() + : DBTestBase("/db_basic_test_track_wal", /*env_do_fsync=*/false) {} + + int CountWalFiles() { + VectorLogPtr log_files; + EXPECT_OK(dbfull()->GetSortedWalFiles(log_files)); + return static_cast(log_files.size()); + }; +}; + +TEST_P(DBBasicTestTrackWal, DoNotTrackObsoleteWal) { + // If a WAL becomes obsolete after flushing, but is not deleted from disk yet, + // then if SyncWAL is called afterwards, the obsolete WAL should not be + // tracked in MANIFEST. + + Options options = CurrentOptions(); + options.create_if_missing = true; + options.track_and_verify_wals_in_manifest = true; + options.atomic_flush = GetParam(); + + DestroyAndReopen(options); + CreateAndReopenWithCF({"cf"}, options); + ASSERT_EQ(handles_.size(), 2); // default, cf + // Do not delete WALs. + ASSERT_OK(db_->DisableFileDeletions()); + constexpr int n = 10; + std::vector> wals(n); + for (size_t i = 0; i < n; i++) { + // Generate a new WAL for each key-value. + const int cf = i % 2; + ASSERT_OK(db_->GetCurrentWalFile(&wals[i])); + ASSERT_OK(Put(cf, "k" + std::to_string(i), "v" + std::to_string(i))); + ASSERT_OK(Flush({0, 1})); + } + ASSERT_EQ(CountWalFiles(), n); + // Since all WALs are obsolete, no WAL should be tracked in MANIFEST. + ASSERT_OK(db_->SyncWAL()); + + // Manually delete all WALs. + Close(); + for (const auto& wal : wals) { + ASSERT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber()))); + } + + // If SyncWAL tracks the obsolete WALs in MANIFEST, + // reopen will fail because the WALs are missing from disk. 
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "cf"}, options)); + Destroy(options); +} + +INSTANTIATE_TEST_CASE_P(DBBasicTestTrackWal, DBBasicTestTrackWal, + testing::Bool()); +#endif // ROCKSDB_LITE + class DBBasicTestMultiGet : public DBTestBase { public: DBBasicTestMultiGet(std::string test_dir, int num_cfs, bool compressed_cache, @@ -2563,12 +2688,13 @@ class DBBasicTestMultiGet : public DBTestBase { table_options.block_cache_compressed = compressed_cache_; table_options.flush_block_policy_factory.reset( new MyFlushBlockPolicyFactory()); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); if (!compression_enabled_) { options.compression = kNoCompression; } else { options.compression_opts.parallel_threads = compression_parallel_threads; } + options_ = options; Reopen(options); if (num_cfs > 1) { @@ -2589,9 +2715,9 @@ class DBBasicTestMultiGet : public DBTestBase { : Put(cf, Key(i), values_[i])) == Status::OK()); } if (num_cfs == 1) { - Flush(); + EXPECT_OK(Flush()); } else { - dbfull()->Flush(FlushOptions(), handles_[cf]); + EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf])); } for (int i = 0; i < 100; ++i) { @@ -2603,9 +2729,9 @@ class DBBasicTestMultiGet : public DBTestBase { Status::OK()); } if (num_cfs == 1) { - Flush(); + EXPECT_OK(Flush()); } else { - dbfull()->Flush(FlushOptions(), handles_[cf]); + EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf])); } } } @@ -2638,6 +2764,7 @@ class DBBasicTestMultiGet : public DBTestBase { bool compression_enabled() { return compression_enabled_; } bool has_compressed_cache() { return compressed_cache_ != nullptr; } bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; } + Options get_options() { return options_; } static void SetUpTestCase() {} static void TearDownTestCase() {} @@ -2693,6 +2820,7 @@ class DBBasicTestMultiGet : public DBTestBase { const char* Name() const override { return "MyBlockCache"; } + using Cache::Insert; Status Insert(const Slice& key, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), Handle** handle = nullptr, @@ -2701,6 +2829,7 @@ class DBBasicTestMultiGet : public DBTestBase { return target_->Insert(key, value, charge, deleter, handle, priority); } + using Cache::Lookup; Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override { num_lookups_++; Handle* handle = target_->Lookup(key, stats); @@ -2723,6 +2852,7 @@ class DBBasicTestMultiGet : public DBTestBase { std::shared_ptr compressed_cache_; std::shared_ptr uncompressed_cache_; + Options options_; bool compression_enabled_; std::vector values_; std::vector uncompressable_values_; @@ -2865,6 +2995,123 @@ TEST_P(DBBasicTestWithParallelIO, MultiGet) { } } +#ifndef ROCKSDB_LITE +TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) { + class FakeDirectIOEnv : public EnvWrapper { + class FakeDirectIOSequentialFile; + class FakeDirectIORandomAccessFile; + + public: + FakeDirectIOEnv(Env* env) : EnvWrapper(env) {} + + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { + std::unique_ptr file; + assert(options.use_direct_reads); + EnvOptions opts = options; + opts.use_direct_reads = false; + Status s = target()->NewRandomAccessFile(fname, &file, opts); + if (!s.ok()) { + return s; + } + result->reset(new FakeDirectIORandomAccessFile(std::move(file))); + return s; + } + + private: + class FakeDirectIOSequentialFile : public 
SequentialFileWrapper { + public: + FakeDirectIOSequentialFile(std::unique_ptr&& file) + : SequentialFileWrapper(file.get()), file_(std::move(file)) {} + ~FakeDirectIOSequentialFile() {} + + bool use_direct_io() const override { return true; } + size_t GetRequiredBufferAlignment() const override { return 1; } + + private: + std::unique_ptr file_; + }; + + class FakeDirectIORandomAccessFile : public RandomAccessFileWrapper { + public: + FakeDirectIORandomAccessFile(std::unique_ptr&& file) + : RandomAccessFileWrapper(file.get()), file_(std::move(file)) {} + ~FakeDirectIORandomAccessFile() {} + + bool use_direct_io() const override { return true; } + size_t GetRequiredBufferAlignment() const override { return 1; } + + private: + std::unique_ptr file_; + }; + }; + + std::unique_ptr env(new FakeDirectIOEnv(env_)); + Options opts = get_options(); + opts.env = env.get(); + opts.use_direct_reads = true; + Reopen(opts); + + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(0, values[0].ToString())); + ASSERT_TRUE(CheckValue(50, values[1].ToString())); + + int random_reads = env_->random_read_counter_.Read(); + key_data[0] = Key(1); + key_data[1] = Key(51); + keys[0] = Slice(key_data[0]); + keys[1] = Slice(key_data[1]); + values[0].Reset(); + values[1].Reset(); + if (uncompressed_cache_) { + uncompressed_cache_->SetCapacity(0); + uncompressed_cache_->SetCapacity(1048576); + } + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(1, values[0].ToString())); + ASSERT_TRUE(CheckValue(51, values[1].ToString())); + + bool read_from_cache = false; + if (fill_cache()) { + if (has_uncompressed_cache()) { + read_from_cache = true; + } else if (has_compressed_cache() && compression_enabled()) { + read_from_cache = true; + } + } + + int expected_reads = random_reads; + if (!compression_enabled() || !has_compressed_cache()) { + expected_reads += 2; + } else { + expected_reads += (read_from_cache ? 0 : 2); + } + if (env_->random_read_counter_.Read() != expected_reads) { + ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); + } + Close(); +} +#endif // ROCKSDB_LITE + TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) { std::vector key_data(10); std::vector keys; @@ -2990,7 +3237,7 @@ class DeadlineFS : public FileSystemWrapper { // or to simply delay but return success anyway. 
The latter mimics the // behavior of PosixFileSystem, which does not enforce any timeout explicit DeadlineFS(SpecialEnv* env, bool error_on_delay) - : FileSystemWrapper(FileSystem::Default()), + : FileSystemWrapper(env->GetFileSystem()), deadline_(std::chrono::microseconds::zero()), io_timeout_(std::chrono::microseconds::zero()), env_(env), @@ -3029,7 +3276,9 @@ class DeadlineFS : public FileSystemWrapper { // Increment the IO counter and return a delay in microseconds IOStatus ShouldDelay(const IOOptions& opts) { - if (!deadline_.count() && !io_timeout_.count()) { + if (timedout_) { + return IOStatus::TimedOut(); + } else if (!deadline_.count() && !io_timeout_.count()) { return IOStatus::OK(); } if (!ignore_deadline_ && delay_trigger_ == io_count_++) { @@ -3165,7 +3414,7 @@ TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) { std::shared_ptr cache = NewLRUCache(1048576); BlockBasedTableOptions table_options; table_options.block_cache = cache; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.env = env.get(); SetTimeElapseOnlySleepOnReopen(&options); ReopenWithColumnFamilies(GetCFNames(), options); @@ -3325,17 +3574,40 @@ TEST_F(DBBasicTest, VerifyFileChecksums) { DestroyAndReopen(options); ASSERT_OK(Put("a", "value")); ASSERT_OK(Flush()); - ASSERT_TRUE(dbfull()->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); Reopen(options); - ASSERT_OK(dbfull()->VerifyFileChecksums(ReadOptions())); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); // Write an L0 with checksum computed. ASSERT_OK(Put("b", "value")); ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->VerifyFileChecksums(ReadOptions())); + ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); + + // Does the right thing but with the wrong name -- using it should lead to an + // error. + class MisnamedFileChecksumGenerator : public FileChecksumGenCrc32c { + public: + MisnamedFileChecksumGenerator(const FileChecksumGenContext& context) + : FileChecksumGenCrc32c(context) {} + + const char* Name() const override { return "sha1"; } + }; + + class MisnamedFileChecksumGenFactory : public FileChecksumGenCrc32cFactory { + public: + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) override { + return std::unique_ptr( + new MisnamedFileChecksumGenerator(context)); + } + }; + + options.file_checksum_gen_factory.reset(new MisnamedFileChecksumGenFactory()); + Reopen(options); + ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); } #endif // !ROCKSDB_LITE @@ -3399,7 +3671,7 @@ TEST_P(DBBasicTestDeadline, PointLookupDeadline) { std::string key = "k" + ToString(i); ASSERT_OK(Put(key, rnd.RandomString(100))); } - Flush(); + ASSERT_OK(Flush()); bool timedout = true; // A timeout will be forced when the IO counter reaches this value diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index da37babcc26..6108bf5b728 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -7,10 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include +#include +#include "cache/cache_entry_roles.h" #include "cache/lru_cache.h" #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "rocksdb/table.h" #include "util/compression.h" #include "util/random.h" @@ -147,6 +150,17 @@ class DBBlockCacheTest : public DBTestBase { compressed_insert_count_ = new_insert_count; compressed_failure_count_ = new_failure_count; } + +#ifndef ROCKSDB_LITE + const std::array& GetCacheEntryRoleCountsBg() { + // Verify in cache entry role stats + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + InternalStats* internal_stats_ptr = cfh->cfd()->internal_stats(); + return internal_stats_ptr->TEST_GetCacheEntryRoleStats(/*foreground=*/false) + .entry_counts; + } +#endif // ROCKSDB_LITE }; TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) { @@ -228,34 +242,48 @@ TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) { #ifdef SNAPPY TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) { - ReadOptions read_options; - auto table_options = GetTableOptions(); - auto options = GetOptions(table_options); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.block_cache_compressed = nullptr; + table_options.block_size = 1; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + table_options.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.compression = CompressionType::kSnappyCompression; - InitTable(options); - std::shared_ptr cache = NewLRUCache(0, 0, false); + DestroyAndReopen(options); + + std::string value(kValueSize, 'a'); + for (size_t i = 0; i < kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + } + + ReadOptions read_options; std::shared_ptr compressed_cache = NewLRUCache(1 << 25, 0, false); + std::shared_ptr cache = NewLRUCache(0, 0, false); table_options.block_cache = cache; + table_options.no_block_cache = false; table_options.block_cache_compressed = compressed_cache; + table_options.max_auto_readahead_size = 0; + table_options.cache_index_and_filter_blocks = false; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); RecordCacheCounters(options); - std::vector> iterators(kNumBlocks - 1); - Iterator* iter = nullptr; - // Load blocks into cache. - for (size_t i = 0; i + 1 < kNumBlocks; i++) { - iter = db_->NewIterator(read_options); - iter->Seek(ToString(i)); - ASSERT_OK(iter->status()); + for (size_t i = 0; i < kNumBlocks - 1; i++) { + ASSERT_EQ(value, Get(ToString(i))); CheckCacheCounters(options, 1, 0, 1, 0); CheckCompressedCacheCounters(options, 1, 0, 1, 0); - iterators[i].reset(iter); } + size_t usage = cache->GetUsage(); - ASSERT_LT(0, usage); + ASSERT_EQ(0, usage); ASSERT_EQ(usage, cache->GetPinnedUsage()); size_t compressed_usage = compressed_cache->GetUsage(); ASSERT_LT(0, compressed_usage); @@ -267,24 +295,21 @@ TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) { cache->SetCapacity(usage); cache->SetStrictCapacityLimit(true); ASSERT_EQ(usage, cache->GetPinnedUsage()); - iter = db_->NewIterator(read_options); - iter->Seek(ToString(kNumBlocks - 1)); - ASSERT_TRUE(iter->status().IsIncomplete()); - CheckCacheCounters(options, 1, 0, 0, 1); + + // Load last key block. 
+ ASSERT_EQ("Result incomplete: Insert failed due to LRU cache being full.", + Get(ToString(kNumBlocks - 1))); + // Failure won't record the miss counter. + CheckCacheCounters(options, 0, 0, 0, 1); CheckCompressedCacheCounters(options, 1, 0, 1, 0); - delete iter; - iter = nullptr; // Clear strict capacity limit flag. This time we shall hit compressed block - // cache. + // cache and load into block cache. cache->SetStrictCapacityLimit(false); - iter = db_->NewIterator(read_options); - iter->Seek(ToString(kNumBlocks - 1)); - ASSERT_OK(iter->status()); + // Load last key block. + ASSERT_EQ(value, Get(ToString(kNumBlocks - 1))); CheckCacheCounters(options, 1, 0, 1, 0); CheckCompressedCacheCounters(options, 0, 1, 0, 0); - delete iter; - iter = nullptr; } #endif // SNAPPY @@ -432,6 +457,33 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { // filter_bytes_insert); } +#if (defined OS_LINUX || defined OS_WIN) +TEST_F(DBBlockCacheTest, WarmCacheWithDataBlocksDuringFlush) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 25, 0, false); + table_options.cache_index_and_filter_blocks = false; + table_options.prepopulate_block_cache = + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + std::string value(kValueSize, 'a'); + for (size_t i = 1; i <= kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); + + ASSERT_EQ(value, Get(ToString(i))); + ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT)); + } +} +#endif + namespace { // A mock cache wraps LRUCache, and record how many entries have been @@ -446,15 +498,18 @@ class MockCache : public LRUCache { false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/) { } - Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), Handle** handle, - Priority priority) override { + using ShardedCache::Insert; + + Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper_cb, size_t charge, + Handle** handle, Priority priority) override { + DeleterFn delete_cb = helper_cb->del_cb; if (priority == Priority::LOW) { low_pri_insert_count++; } else { high_pri_insert_count++; } - return LRUCache::Insert(key, value, charge, deleter, handle, priority); + return LRUCache::Insert(key, value, charge, delete_cb, handle, priority); } }; @@ -533,6 +588,7 @@ class LookupLiarCache : public CacheWrapper { explicit LookupLiarCache(std::shared_ptr target) : CacheWrapper(std::move(target)) {} + using Cache::Lookup; Handle* Lookup(const Slice& key, Statistics* stats) override { if (nth_lookup_not_found_ == 1) { nth_lookup_not_found_ = 0; @@ -677,7 +733,7 @@ TEST_F(DBBlockCacheTest, ParanoidFileChecks) { // Create a new SST file. This will further trigger a compaction // and generate another file. 
ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(3, /* Totally 3 files created up to now */ TestGetTickerCount(options, BLOCK_CACHE_ADD)); @@ -692,7 +748,7 @@ TEST_F(DBBlockCacheTest, ParanoidFileChecks) { ASSERT_OK(Put(1, "1_key4", "val4")); ASSERT_OK(Put(1, "9_key4", "val4")); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(3, /* Totally 3 files created up to now */ TestGetTickerCount(options, BLOCK_CACHE_ADD)); } @@ -837,8 +893,9 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { Random rnd(301); for (auto compression_type : compression_types) { Options options = CurrentOptions(); - options.compression = compression_type; - options.compression_opts.max_dict_bytes = 4096; + options.bottommost_compression = compression_type; + options.bottommost_compression_opts.max_dict_bytes = 4096; + options.bottommost_compression_opts.enabled = true; options.create_if_missing = true; options.num_levels = 2; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); @@ -859,7 +916,7 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { } ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1)); @@ -886,8 +943,364 @@ TEST_F(DBBlockCacheTest, CacheCompressionDict) { } } +static void ClearCache(Cache* cache) { + std::deque keys; + Cache::ApplyToAllEntriesOptions opts; + auto callback = [&](const Slice& key, void* /*value*/, size_t /*charge*/, + Cache::DeleterFn /*deleter*/) { + keys.push_back(key.ToString()); + }; + cache->ApplyToAllEntries(callback, opts); + for (auto& k : keys) { + cache->Erase(k); + } +} + +TEST_F(DBBlockCacheTest, CacheEntryRoleStats) { + const size_t capacity = size_t{1} << 25; + int iterations_tested = 0; + for (bool partition : {false, true}) { + for (std::shared_ptr cache : + {NewLRUCache(capacity), NewClockCache(capacity)}) { + if (!cache) { + // Skip clock cache when not supported + continue; + } + ++iterations_tested; + + Options options = CurrentOptions(); + SetTimeElapseOnlySleepOnReopen(&options); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.max_open_files = 13; + options.table_cache_numshardbits = 0; + // If this wakes up, it could interfere with test + options.stats_dump_period_sec = 0; + + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(50)); + if (partition) { + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + table_options.partition_filters = true; + } + table_options.metadata_cache_options.top_level_index_pinning = + PinningTier::kNone; + table_options.metadata_cache_options.partition_pinning = + PinningTier::kNone; + table_options.metadata_cache_options.unpartitioned_pinning = + PinningTier::kNone; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Create a new table. 
+ ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put("bar", "value")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("zfoo", "value")); + ASSERT_OK(Put("zbar", "value")); + ASSERT_OK(Flush()); + + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + + // Fresh cache + ClearCache(cache.get()); + + std::array expected{}; + // For CacheEntryStatsCollector + expected[static_cast(CacheEntryRole::kMisc)] = 1; + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + + std::array prev_expected = expected; + + // First access only filters + ASSERT_EQ("NOT_FOUND", Get("different from any key added")); + expected[static_cast(CacheEntryRole::kFilterBlock)] += 2; + if (partition) { + expected[static_cast(CacheEntryRole::kFilterMetaBlock)] += 2; + } + // Within some time window, we will get cached entry stats + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // Not enough to force a miss + env_->MockSleepForSeconds(45); + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // Enough to force a miss + env_->MockSleepForSeconds(601); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + + // Now access index and data block + ASSERT_EQ("value", Get("foo")); + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + if (partition) { + // top-level + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + } + expected[static_cast(CacheEntryRole::kDataBlock)]++; + // Enough to force a miss + env_->MockSleepForSeconds(601); + // But inject a simulated long scan so that we need a longer + // interval to force a miss next time. + SyncPoint::GetInstance()->SetCallBack( + "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", + [this](void*) { + // To spend no more than 0.2% of time scanning, we would need + // interval of at least 10000s + env_->MockSleepForSeconds(20); + }); + SyncPoint::GetInstance()->EnableProcessing(); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + prev_expected = expected; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // The same for other file + ASSERT_EQ("value", Get("zfoo")); + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + if (partition) { + // top-level + expected[static_cast(CacheEntryRole::kIndexBlock)]++; + } + expected[static_cast(CacheEntryRole::kDataBlock)]++; + // Because of the simulated long scan, this is not enough to force + // a miss + env_->MockSleepForSeconds(601); + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // But this is enough + env_->MockSleepForSeconds(10000); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + prev_expected = expected; + + // Also check the GetProperty interface + std::map values; + ASSERT_TRUE( + db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values)); + + EXPECT_EQ( + ToString(expected[static_cast(CacheEntryRole::kIndexBlock)]), + values["count.index-block"]); + EXPECT_EQ( + ToString(expected[static_cast(CacheEntryRole::kDataBlock)]), + values["count.data-block"]); + EXPECT_EQ( + ToString(expected[static_cast(CacheEntryRole::kFilterBlock)]), + values["count.filter-block"]); + EXPECT_EQ( + ToString( + prev_expected[static_cast(CacheEntryRole::kWriteBuffer)]), + values["count.write-buffer"]); + EXPECT_EQ(ToString(expected[static_cast(CacheEntryRole::kMisc)]), + values["count.misc"]); + + // Add one for kWriteBuffer + { + WriteBufferManager wbm(size_t{1} << 20, cache); + wbm.ReserveMem(1024); + expected[static_cast(CacheEntryRole::kWriteBuffer)]++; + // Now we check that the GetProperty interface is more agressive about + // re-scanning stats, but not 
totally aggressive. + // Within some time window, we will get cached entry stats + env_->MockSleepForSeconds(1); + EXPECT_EQ(ToString(prev_expected[static_cast( + CacheEntryRole::kWriteBuffer)]), + values["count.write-buffer"]); + // Not enough for a "background" miss but enough for a "foreground" miss + env_->MockSleepForSeconds(45); + + ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, + &values)); + EXPECT_EQ( + ToString( + expected[static_cast(CacheEntryRole::kWriteBuffer)]), + values["count.write-buffer"]); + } + prev_expected = expected; + + // With collector pinned in cache, we should be able to hit + // even if the cache is full + ClearCache(cache.get()); + Cache::Handle* h = nullptr; + ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1, + GetNoopDeleterForRole(), + &h, Cache::Priority::HIGH)); + ASSERT_GT(cache->GetUsage(), cache->GetCapacity()); + expected = {}; + expected[static_cast(CacheEntryRole::kMisc)]++; + // Still able to hit on saved stats + EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg()); + // Enough to force a miss + env_->MockSleepForSeconds(1000); + EXPECT_EQ(expected, GetCacheEntryRoleCountsBg()); + + cache->Release(h); + } + EXPECT_GE(iterations_tested, 1); + } +} + #endif // ROCKSDB_LITE +class DBBlockCachePinningTest + : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { + public: + DBBlockCachePinningTest() + : DBTestBase("/db_block_cache_test", /*env_do_fsync=*/false) {} + + void SetUp() override { + partition_index_and_filters_ = std::get<0>(GetParam()); + top_level_index_pinning_ = std::get<1>(GetParam()); + partition_pinning_ = std::get<2>(GetParam()); + unpartitioned_pinning_ = std::get<3>(GetParam()); + } + + bool partition_index_and_filters_; + PinningTier top_level_index_pinning_; + PinningTier partition_pinning_; + PinningTier unpartitioned_pinning_; +}; + +TEST_P(DBBlockCachePinningTest, TwoLevelDB) { + // Creates one file in L0 and one file in L1. Both files have enough data that + // their index and filter blocks are partitioned. The L1 file will also have + // a compression dictionary (those are trained only during compaction), which + // must be unpartitioned. + const int kKeySize = 32; + const int kBlockSize = 128; + const int kNumBlocksPerFile = 128; + const int kNumKeysPerFile = kBlockSize * kNumBlocksPerFile / kKeySize; + + Options options = CurrentOptions(); + // `kNoCompression` makes the unit test more portable. But it relies on the + // current behavior of persisting/accessing dictionary even when there's no + // (de)compression happening, which seems fairly likely to change over time. 
+ options.compression = kNoCompression; + options.compression_opts.max_dict_bytes = 4 << 10; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.block_cache = NewLRUCache(1 << 20 /* capacity */); + table_options.block_size = kBlockSize; + table_options.metadata_block_size = kBlockSize; + table_options.cache_index_and_filter_blocks = true; + table_options.metadata_cache_options.top_level_index_pinning = + top_level_index_pinning_; + table_options.metadata_cache_options.partition_pinning = partition_pinning_; + table_options.metadata_cache_options.unpartitioned_pinning = + unpartitioned_pinning_; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10 /* bits_per_key */)); + if (partition_index_and_filters_) { + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.partition_filters = true; + } + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kKeySize))); + } + ASSERT_OK(Flush()); + if (i == 0) { + // Prevent trivial move so file will be rewritten with dictionary and + // reopened with L1's pinning settings. + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + } + } + + // Clear all unpinned blocks so unpinned blocks will show up as cache misses + // when reading a key from a file. + table_options.block_cache->EraseUnRefEntries(); + + // Get base cache values + uint64_t filter_misses = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + uint64_t index_misses = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); + uint64_t compression_dict_misses = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS); + + // Read a key from the L0 file + Get(Key(kNumKeysPerFile)); + uint64_t expected_filter_misses = filter_misses; + uint64_t expected_index_misses = index_misses; + uint64_t expected_compression_dict_misses = compression_dict_misses; + if (partition_index_and_filters_) { + if (top_level_index_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + if (partition_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + } else { + if (unpartitioned_pinning_ == PinningTier::kNone) { + ++expected_filter_misses; + ++expected_index_misses; + } + } + if (unpartitioned_pinning_ == PinningTier::kNone) { + ++expected_compression_dict_misses; + } + ASSERT_EQ(expected_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(expected_index_misses, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(expected_compression_dict_misses, + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); + + // Clear all unpinned blocks so unpinned blocks will show up as cache misses + // when reading a key from a file. 
+ table_options.block_cache->EraseUnRefEntries(); + + // Read a key from the L1 file + Get(Key(0)); + if (partition_index_and_filters_) { + if (top_level_index_pinning_ == PinningTier::kNone || + top_level_index_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + if (partition_pinning_ == PinningTier::kNone || + partition_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + } else { + if (unpartitioned_pinning_ == PinningTier::kNone || + unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_filter_misses; + ++expected_index_misses; + } + } + if (unpartitioned_pinning_ == PinningTier::kNone || + unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) { + ++expected_compression_dict_misses; + } + ASSERT_EQ(expected_filter_misses, + TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(expected_index_misses, + TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(expected_compression_dict_misses, + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); +} + +INSTANTIATE_TEST_CASE_P( + DBBlockCachePinningTest, DBBlockCachePinningTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll), + ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar, + PinningTier::kAll))); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index d4610246181..1405b737a52 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -7,11 +7,15 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include + #include "db/db_test_util.h" #include "options/options_helper.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" #include "table/block_based/filter_policy_internal.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -128,8 +132,8 @@ TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow trivial move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */)); numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); @@ -178,7 +182,7 @@ TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) { ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); - dbfull()->Flush(fo); + ASSERT_OK(dbfull()->Flush(fo)); ASSERT_EQ("foo", Get("barbarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); @@ -244,7 +248,7 @@ TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) { ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); - dbfull()->Flush(fo); + ASSERT_OK(dbfull()->Flush(fo)); ASSERT_EQ("foo", Get("barbarbar")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); @@ -297,7 +301,7 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { // ranges. 
ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - dbfull()->Flush(fo); + ASSERT_OK(dbfull()->Flush(fo)); Reopen(options); ASSERT_EQ("NOT_FOUND", Get("foo")); @@ -328,7 +332,7 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { // ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Reopen with both of whole key off and prefix extractor enabled. // Still no bloom filter should be used. @@ -351,7 +355,7 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { // ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); options.prefix_extractor.reset(); bbto.whole_key_filtering = true; @@ -364,7 +368,7 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { // not filtered out by key ranges. ASSERT_OK(dbfull()->Put(wo, "aaa", "")); ASSERT_OK(dbfull()->Put(wo, "zzz", "")); - Flush(); + ASSERT_OK(Flush()); // Now we have two files: // File 1: An older file with prefix bloom. @@ -467,7 +471,7 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { for (int i = 0; i < N; i += 100) { ASSERT_OK(Put(1, Key(i), Key(i))); } - Flush(1); + ASSERT_OK(Flush(1)); // Prevent auto compactions triggered by seeks env_->delay_sstable_sync_.store(true, std::memory_order_release); @@ -503,6 +507,21 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { ASSERT_LE(reads, 3 * N / 100); } +#ifndef ROCKSDB_LITE + // Sanity check some table properties + std::map props; + ASSERT_TRUE(db_->GetMapProperty( + handles_[1], DB::Properties::kAggregatedTableProperties, &props)); + uint64_t nkeys = N + N / 100; + uint64_t filter_size = ParseUint64(props["filter_size"]); + EXPECT_LE(filter_size, + (partition_filters_ ? 
12 : 11) * nkeys / /*bits / byte*/ 8); + EXPECT_GE(filter_size, 10 * nkeys / /*bits / byte*/ 8); + + uint64_t num_filter_entries = ParseUint64(props["num_filter_entries"]); + EXPECT_EQ(num_filter_entries, nkeys); +#endif // ROCKSDB_LITE + env_->delay_sstable_sync_.store(false, std::memory_order_release); Close(); } while (ChangeCompactOptions()); @@ -514,24 +533,24 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( std::make_tuple(BFP::kDeprecatedBlock, false, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion))); + std::make_tuple(BFP::kAutoBloom, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kAutoBloom, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatDef, DBBloomFilterTestWithParam, ::testing::Values( std::make_tuple(BFP::kDeprecatedBlock, false, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, true, test::kDefaultFormatVersion), - std::make_tuple(BFP::kAuto, false, test::kDefaultFormatVersion))); + std::make_tuple(BFP::kAutoBloom, true, test::kDefaultFormatVersion), + std::make_tuple(BFP::kAutoBloom, false, test::kDefaultFormatVersion))); INSTANTIATE_TEST_CASE_P( FormatLatest, DBBloomFilterTestWithParam, ::testing::Values( std::make_tuple(BFP::kDeprecatedBlock, false, test::kLatestFormatVersion), - std::make_tuple(BFP::kAuto, true, test::kLatestFormatVersion), - std::make_tuple(BFP::kAuto, false, test::kLatestFormatVersion))); + std::make_tuple(BFP::kAutoBloom, true, test::kLatestFormatVersion), + std::make_tuple(BFP::kAutoBloom, false, test::kLatestFormatVersion))); #endif // ROCKSDB_VALGRIND_RUN TEST_F(DBBloomFilterTest, BloomFilterRate) { @@ -771,6 +790,14 @@ class LevelAndStyleCustomFilterPolicy : public FilterPolicy { const std::unique_ptr policy_otherwise_; }; +static std::map + table_file_creation_reason_to_string{ + {TableFileCreationReason::kCompaction, "kCompaction"}, + {TableFileCreationReason::kFlush, "kFlush"}, + {TableFileCreationReason::kMisc, "kMisc"}, + {TableFileCreationReason::kRecovery, "kRecovery"}, + }; + class TestingContextCustomFilterPolicy : public LevelAndStyleCustomFilterPolicy { public: @@ -783,11 +810,17 @@ class TestingContextCustomFilterPolicy const FilterBuildingContext& context) const override { test_report_ += "cf="; test_report_ += context.column_family_name; - test_report_ += ",cs="; + test_report_ += ",s="; test_report_ += OptionsHelper::compaction_style_to_string[context.compaction_style]; - test_report_ += ",lv="; - test_report_ += std::to_string(context.level_at_creation); + test_report_ += ",n="; + test_report_ += ToString(context.num_levels); + test_report_ += ",l="; + test_report_ += ToString(context.level_at_creation); + test_report_ += ",b="; + test_report_ += ToString(int{context.is_bottommost}); + test_report_ += ",r="; + test_report_ += table_file_creation_reason_to_string[context.reason]; test_report_ += "\n"; return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context); @@ -805,18 +838,21 @@ class TestingContextCustomFilterPolicy } // namespace TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { + auto policy = std::make_shared(15, 8, 5); + Options options; for (bool fifo : {true, false}) { - Options options = CurrentOptions(); + options = CurrentOptions(); + options.max_open_files = fifo ? -1 : options.max_open_files; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.compaction_style = fifo ? 
kCompactionStyleFIFO : kCompactionStyleLevel; BlockBasedTableOptions table_options; - auto policy = std::make_shared(15, 8, 5); table_options.filter_policy = policy; table_options.format_version = 5; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + TryReopen(options); CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options); const int maxKey = 10000; @@ -827,16 +863,16 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); Flush(1); EXPECT_EQ(policy->DumpTestReport(), - fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n" - : "cf=bob,cs=kCompactionStyleLevel,lv=0\n"); + fifo ? "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n" + : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); for (int i = maxKey / 2; i < maxKey; i++) { ASSERT_OK(Put(1, Key(i), Key(i))); } Flush(1); EXPECT_EQ(policy->DumpTestReport(), - fifo ? "cf=abe,cs=kCompactionStyleFIFO,lv=0\n" - : "cf=bob,cs=kCompactionStyleLevel,lv=0\n"); + fifo ? "cf=abe,s=kCompactionStyleFIFO,n=1,l=0,b=0,r=kFlush\n" + : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); // Check that they can be found for (int i = 0; i < maxKey; i++) { @@ -864,7 +900,7 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); EXPECT_EQ(policy->DumpTestReport(), - "cf=bob,cs=kCompactionStyleLevel,lv=1\n"); + "cf=bob,s=kCompactionStyleLevel,n=7,l=1,b=1,r=kCompaction\n"); // Check that we now have one filter, about 9.2% FP rate (5 bits per key) for (int i = 0; i < maxKey; i++) { @@ -876,11 +912,25 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { EXPECT_GE(useful_count, maxKey * 0.90); EXPECT_LE(useful_count, maxKey * 0.91); } + } else { +#ifndef ROCKSDB_LITE + // Also try external SST file + { + std::string file_path = dbname_ + "/external.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options, handles_[1]); + ASSERT_OK(sst_file_writer.Open(file_path)); + ASSERT_OK(sst_file_writer.Put("key", "value")); + ASSERT_OK(sst_file_writer.Finish()); + } + // Note: kCompactionStyleLevel is default, ignored if num_levels == -1 + EXPECT_EQ(policy->DumpTestReport(), + "cf=abe,s=kCompactionStyleLevel,n=-1,l=-1,b=0,r=kMisc\n"); +#endif } // Destroy ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - dbfull()->DestroyColumnFamilyHandle(handles_[1]); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); handles_[1] = nullptr; } } @@ -1444,9 +1494,9 @@ void PrefixScanInit(DBBloomFilterTest* dbtest) { snprintf(buf, sizeof(buf), "%02d______:end", 10); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->Flush(); - dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, - nullptr); // move to level 1 + ASSERT_OK(dbtest->Flush()); + ASSERT_OK(dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr, + nullptr)); // move to level 1 // GROUP 1 for (int i = 1; i <= small_range_sstfiles; i++) { @@ -1563,21 +1613,21 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { for (int key : keys) { ASSERT_OK(Put(1, Key(key), "val")); if (++num_inserted % 1000 == 0) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } ASSERT_OK(Put(1, Key(0), "val")); ASSERT_OK(Put(1, Key(numkeys), "val")); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0, 1) == 0) { // 
No Level 0 file. Create one. ASSERT_OK(Put(1, Key(0), "val")); ASSERT_OK(Put(1, Key(numkeys), "val")); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } for (int i = 1; i < numkeys; i += 2) { @@ -1682,7 +1732,8 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { BottommostLevelCompaction::kSkip; compact_options.change_level = true; compact_options.target_level = 7; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr) + .IsNotSupported()); ASSERT_EQ(trivial_move, 1); ASSERT_EQ(non_trivial_move, 0); @@ -1714,10 +1765,10 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { int CountIter(std::unique_ptr& iter, const Slice& key) { int count = 0; - for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK(); - iter->Next()) { + for (iter->Seek(key); iter->Valid(); iter->Next()) { count++; } + EXPECT_OK(iter->status()); return count; } @@ -1730,6 +1781,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; Options options; options.create_if_missing = true; + options.env = CurrentOptions().env; options.prefix_extractor.reset(NewCappedPrefixTransform(4)); options.disable_auto_compactions = true; options.statistics = CreateDBStatistics(); @@ -1746,7 +1798,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { ASSERT_OK(Put("abcdxxx1", "val2")); ASSERT_OK(Put("abcdxxx2", "val3")); ASSERT_OK(Put("abcdxxx3", "val4")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { // prefix_extractor has not changed, BF will always be read Slice upper_bound("abce"); @@ -1860,6 +1912,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { for (auto bfp_impl : BFP::kAllFixedImpls) { int using_full_builder = bfp_impl != BFP::kDeprecatedBlock; Options options; + options.env = CurrentOptions().env; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.disable_auto_compactions = true; @@ -1903,7 +1956,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { ASSERT_OK(Put("foo4", "bar4")); ASSERT_OK(Put("foq5", "bar5")); ASSERT_OK(Put("fpb", "1")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { // BF is cappped:3 now std::unique_ptr iter_tmp(db_->NewIterator(read_options)); @@ -1927,7 +1980,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { ASSERT_OK(Put("foo7", "bar7")); ASSERT_OK(Put("foq8", "bar8")); ASSERT_OK(Put("fpc", "2")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { // BF is fixed:2 now std::unique_ptr iter_tmp(db_->NewIterator(read_options)); @@ -2038,10 +2091,10 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); } ASSERT_OK(dbfull()->DropColumnFamily(handles_[2])); - dbfull()->DestroyColumnFamilyHandle(handles_[2]); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[2])); handles_[2] = nullptr; ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); - dbfull()->DestroyColumnFamilyHandle(handles_[1]); + ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1])); handles_[1] = nullptr; iteration++; } @@ -2052,6 +2105,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { for (auto bfp_impl : BFP::kAllFixedImpls) { Options options; + options.env 
= CurrentOptions().env; options.create_if_missing = true; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.disable_auto_compactions = true; @@ -2113,6 +2167,54 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { } } +TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) { + Options options = CurrentOptions(); + constexpr size_t kNumKeys = 10000; + static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000"); + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeys + 10)); + options.create_if_missing = true; + constexpr size_t kPrefixLength = 4; + options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength)); + options.compression = kNoCompression; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(50)); + bbto.index_shortening = + BlockBasedTableOptions::IndexShorteningMode::kNoShortening; + bbto.block_size = 128; + bbto.metadata_block_size = 128; + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + const std::string value(64, '\0'); + + WriteOptions write_opts; + write_opts.disableWAL = true; + for (size_t i = 0; i < kNumKeys; ++i) { + std::ostringstream oss; + oss << std::setfill('0') << std::setw(4) << std::fixed << i; + ASSERT_OK(db_->Put(write_opts, oss.str(), value)); + } + ASSERT_OK(Flush()); + + ReadOptions read_opts; + // Use legacy, implicit prefix seek + read_opts.total_order_seek = false; + read_opts.auto_prefix_mode = false; + std::unique_ptr it(db_->NewIterator(read_opts)); + for (size_t i = 0; i < kNumKeys; ++i) { + // Seek with a key after each one added but with same prefix. One will + // surely cross a partition boundary. + std::ostringstream oss; + oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a"; + it->SeekForPrev(oss.str()); + ASSERT_OK(it->status()); + ASSERT_TRUE(it->Valid()); + } + it.reset(); +} + #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc index fb9186caead..d5a8db1c3e2 100644 --- a/db/db_compaction_filter_test.cc +++ b/db/db_compaction_filter_test.cc @@ -42,7 +42,7 @@ class DBTestCompactionFilterWithCompactParam option_config_ == kUniversalSubcompactions) { assert(options.max_subcompactions > 1); } - TryReopen(options); + Reopen(options); } }; @@ -82,6 +82,11 @@ class DeleteFilter : public CompactionFilter { return true; } + bool FilterMergeOperand(int /*level*/, const Slice& /*key*/, + const Slice& /*operand*/) const override { + return true; + } + const char* Name() const override { return "DeleteFilter"; } }; @@ -190,18 +195,36 @@ class KeepFilterFactory : public CompactionFilterFactory { bool compaction_filter_created_; }; +// This filter factory is configured with a `TableFileCreationReason`. Only +// table files created for that reason will undergo filtering. This +// configurability makes it useful to tests for filtering non-compaction table +// files, such as "CompactionFilterFlush" and "CompactionFilterRecovery". 
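Aside: a minimal sketch (not part of this diff; the class names are hypothetical, only the overridden virtuals are the RocksDB API also used by the test factory below) of how an application-side factory can opt into flush-time filtering through `ShouldFilterTableFileCreation()`:

#include <memory>
#include <string>

#include "rocksdb/compaction_filter.h"

// Hypothetical filter that drops everything it is asked about.
class DropEverythingFilter : public rocksdb::CompactionFilter {
 public:
  bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
              const rocksdb::Slice& /*value*/, std::string* /*new_value*/,
              bool* /*value_changed*/) const override {
    return true;  // drop every key-value pair this filter sees
  }
  const char* Name() const override { return "DropEverythingFilter"; }
};

// Hypothetical factory that filters only table files created by flush;
// compaction- and recovery-created files are left alone.
class FlushOnlyFilterFactory : public rocksdb::CompactionFilterFactory {
 public:
  bool ShouldFilterTableFileCreation(
      rocksdb::TableFileCreationReason reason) const override {
    return reason == rocksdb::TableFileCreationReason::kFlush;
  }
  std::unique_ptr<rocksdb::CompactionFilter> CreateCompactionFilter(
      const rocksdb::CompactionFilter::Context& /*context*/) override {
    return std::unique_ptr<rocksdb::CompactionFilter>(
        new DropEverythingFilter());
  }
  const char* Name() const override { return "FlushOnlyFilterFactory"; }
};

With such a factory installed in `options.compaction_filter_factory`, data would be dropped at flush time but preserved across compaction and recovery, which is the behavior the `CompactionFilterFlush` test further below asserts for the analogous `DeleteFilterFactory` configured with `TableFileCreationReason::kFlush`.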
 class DeleteFilterFactory : public CompactionFilterFactory {
  public:
+  explicit DeleteFilterFactory(TableFileCreationReason reason)
+      : reason_(reason) {}
+
   std::unique_ptr<CompactionFilter> CreateCompactionFilter(
       const CompactionFilter::Context& context) override {
-    if (context.is_manual_compaction) {
-      return std::unique_ptr<CompactionFilter>(new DeleteFilter());
-    } else {
+    EXPECT_EQ(reason_, context.reason);
+    if (context.reason == TableFileCreationReason::kCompaction &&
+        !context.is_manual_compaction) {
+      // Table files created by automatic compaction do not undergo filtering.
+      // Presumably some tests rely on this.
       return std::unique_ptr<CompactionFilter>(nullptr);
     }
+    return std::unique_ptr<CompactionFilter>(new DeleteFilter());
+  }
+
+  bool ShouldFilterTableFileCreation(
+      TableFileCreationReason reason) const override {
+    return reason_ == reason;
   }
 
   const char* Name() const override { return "DeleteFilterFactory"; }
+
+ private:
+  const TableFileCreationReason reason_;
 };
 
 // Delete Filter Factory which ignores snapshots
@@ -276,7 +299,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) {
   for (int i = 0; i < 100000; i++) {
     char key[100];
     snprintf(key, sizeof(key), "B%010d", i);
-    Put(1, key, value);
+    ASSERT_OK(Put(1, key, value));
   }
   ASSERT_OK(Flush(1));
@@ -284,10 +307,10 @@
   // the compaction is each level invokes the filter for
   // all the keys in that level.
   cfilter_count = 0;
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
   ASSERT_EQ(cfilter_count, 100000);
   cfilter_count = 0;
-  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
   ASSERT_EQ(cfilter_count, 100000);
   ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
@@ -314,13 +337,14 @@
     ASSERT_OK(iter->status());
     while (iter->Valid()) {
       ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-      ASSERT_OK(ParseInternalKey(iter->key(), &ikey));
+      ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
       total++;
       if (ikey.sequence != 0) {
         count++;
       }
       iter->Next();
     }
+    ASSERT_OK(iter->status());
   }
   ASSERT_EQ(total, 100000);
   ASSERT_EQ(count, 0);
@@ -337,10 +361,10 @@
   // means that all keys should pass at least once
   // via the compaction filter
   cfilter_count = 0;
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
   ASSERT_EQ(cfilter_count, 100000);
   cfilter_count = 0;
-  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
   ASSERT_EQ(cfilter_count, 100000);
   ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
   ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
@@ -348,7 +372,8 @@
   // create a new database with the compaction
   // filter in such a way that it deletes all keys
-  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>(
+      TableFileCreationReason::kCompaction);
   options.create_if_missing = true;
   DestroyAndReopen(options);
   CreateAndReopenWithCF({"pikachu"}, options);
@@ -369,10 +394,10 @@
   // verify that at the end of the compaction process,
   // nothing is left.
cfilter_count = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 100000); cfilter_count = 0; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); ASSERT_EQ(cfilter_count, 0); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); @@ -387,6 +412,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { count++; iter->Next(); } + ASSERT_OK(iter->status()); ASSERT_EQ(count, 0); } @@ -405,7 +431,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { ASSERT_OK(iter->status()); while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_OK(ParseInternalKey(iter->key(), &ikey)); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); ASSERT_NE(ikey.sequence, (unsigned)0); count++; iter->Next(); @@ -419,7 +445,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { // entries in VersionEdit, but none of the 'AddFile's. TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { Options options = CurrentOptions(); - options.compaction_filter_factory = std::make_shared(); + options.compaction_filter_factory = std::make_shared( + TableFileCreationReason::kCompaction); options.disable_auto_compactions = true; options.create_if_missing = true; DestroyAndReopen(options); @@ -427,9 +454,9 @@ TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { // put some data for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); + ASSERT_OK(Put(ToString(table * 100 + i), "val")); } - Flush(); + ASSERT_OK(Flush()); } // this will produce empty file (delete compaction filter) @@ -440,6 +467,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { Iterator* itr = db_->NewIterator(ReadOptions()); itr->SeekToFirst(); + ASSERT_OK(itr->status()); // empty db ASSERT_TRUE(!itr->Valid()); @@ -447,6 +475,64 @@ TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { } #endif // ROCKSDB_LITE +TEST_F(DBTestCompactionFilter, CompactionFilterFlush) { + // Tests a `CompactionFilterFactory` that filters when table file is created + // by flush. + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared(TableFileCreationReason::kFlush); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen(options); + + // Puts and Merges are purged in flush. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + ASSERT_OK(Flush()); + ASSERT_EQ("NOT_FOUND", Get("a")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + // However, Puts and Merges are preserved by recovery. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + Reopen(options); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); + + // Likewise, compaction does not apply filtering. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); +} + +TEST_F(DBTestCompactionFilter, CompactionFilterRecovery) { + // Tests a `CompactionFilterFactory` that filters when table file is created + // by recovery. + Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared(TableFileCreationReason::kRecovery); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + Reopen(options); + + // Puts and Merges are purged in recovery. 
+ ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("a")); + ASSERT_EQ("NOT_FOUND", Get("b")); + + // However, Puts and Merges are preserved by flush. + ASSERT_OK(Put("a", "v")); + ASSERT_OK(Merge("b", "v")); + ASSERT_OK(Flush()); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); + + // Likewise, compaction does not apply filtering. + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ("v", Get("a")); + ASSERT_EQ("v", Get("b")); +} + TEST_P(DBTestCompactionFilterWithCompactParam, CompactionFilterWithValueChange) { Options options = CurrentOptions(); @@ -463,25 +549,25 @@ TEST_P(DBTestCompactionFilterWithCompactParam, for (int i = 0; i < 100001; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } // push all files to lower levels ASSERT_OK(Flush(1)); if (option_config_ != kUniversalCompactionMultiLevel && option_config_ != kUniversalSubcompactions) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); } else { - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); } // re-write all data again for (int i = 0; i < 100001; i++) { char key[100]; snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } // push all files to lower levels. This should @@ -489,11 +575,11 @@ TEST_P(DBTestCompactionFilterWithCompactParam, ASSERT_OK(Flush(1)); if (option_config_ != kUniversalCompactionMultiLevel && option_config_ != kUniversalSubcompactions) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1])); } else { - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], + nullptr, nullptr)); } // verify that all keys now have the new value that @@ -531,7 +617,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { ASSERT_OK(Flush()); std::string newvalue = Get("foo"); ASSERT_EQ(newvalue, three); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("foo"); ASSERT_EQ(newvalue, three); @@ -539,12 +625,12 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { // merge keys. 
ASSERT_OK(db_->Put(WriteOptions(), "bar", two)); ASSERT_OK(Flush()); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("bar"); ASSERT_EQ("NOT_FOUND", newvalue); ASSERT_OK(db_->Merge(WriteOptions(), "bar", two)); ASSERT_OK(Flush()); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("bar"); ASSERT_EQ(two, two); @@ -555,7 +641,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { ASSERT_OK(Flush()); newvalue = Get("foobar"); ASSERT_EQ(newvalue, three); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("foobar"); ASSERT_EQ(newvalue, three); @@ -568,7 +654,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { ASSERT_OK(Flush()); newvalue = Get("barfoo"); ASSERT_EQ(newvalue, four); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); newvalue = Get("barfoo"); ASSERT_EQ(newvalue, four); } @@ -590,21 +676,21 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { for (int i = 0; i < num_keys_per_file; i++) { char key[100]; snprintf(key, sizeof(key), "B%08d%02d", i, j); - Put(key, value); + ASSERT_OK(Put(key, value)); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Make sure next file is much smaller so automatic compaction will not // be triggered. num_keys_per_file /= 2; } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Force a manual compaction cfilter_count = 0; filter->expect_manual_compaction_.store(true); filter->expect_full_compaction_.store(true); filter->expect_cf_id_.store(0); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(cfilter_count, 700); ASSERT_EQ(NumSortedRuns(0), 1); ASSERT_TRUE(filter->compaction_filter_created()); @@ -624,7 +710,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { ASSERT_OK(iter->status()); while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_OK(ParseInternalKey(iter->key(), &ikey)); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); total++; if (ikey.sequence != 0) { count++; @@ -654,14 +740,14 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) { for (int i = 0; i < num_keys_per_file; i++) { char key[100]; snprintf(key, sizeof(key), "B%08d%02d", i, j); - Put(1, key, value); + ASSERT_OK(Put(1, key, value)); } - Flush(1); + ASSERT_OK(Flush(1)); // Make sure next file is much smaller so automatic compaction will not // be triggered. 
     num_keys_per_file /= 2;
   }
-  dbfull()->TEST_WaitForCompact();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_TRUE(filter->compaction_filter_created());
 }
@@ -680,9 +766,9 @@ TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) {
   const Snapshot* snapshot = nullptr;
   for (int table = 0; table < 4; ++table) {
     for (int i = 0; i < 10; ++i) {
-      Put(ToString(table * 100 + i), "val");
+      ASSERT_OK(Put(ToString(table * 100 + i), "val"));
     }
-    Flush();
+    ASSERT_OK(Flush());
     if (table == 0) {
       snapshot = db_->GetSnapshot();
@@ -702,6 +788,7 @@
   read_options.snapshot = snapshot;
   std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
   iter->SeekToFirst();
+  ASSERT_OK(iter->status());
   int count = 0;
   while (iter->Valid()) {
     count++;
@@ -710,6 +797,7 @@
   ASSERT_EQ(count, 6);
   read_options.snapshot = nullptr;
   std::unique_ptr<Iterator> iter1(db_->NewIterator(read_options));
+  ASSERT_OK(iter1->status());
   iter1->SeekToFirst();
   count = 0;
   while (iter1->Valid()) {
@@ -740,9 +828,9 @@ TEST_F(DBTestCompactionFilter, SkipUntil) {
     for (int i = table * 6; i < 39 + table * 11; ++i) {
       char key[100];
       snprintf(key, sizeof(key), "%010d", table * 100 + i);
-      Put(key, std::to_string(table * 1000 + i));
+      ASSERT_OK(Put(key, std::to_string(table * 1000 + i)));
     }
-    Flush();
+    ASSERT_OK(Flush());
   }
   cfilter_skips = 0;
@@ -781,10 +869,10 @@ TEST_F(DBTestCompactionFilter, SkipUntilWithBloomFilter) {
   options.create_if_missing = true;
   DestroyAndReopen(options);
-  Put("0000000010", "v10");
-  Put("0000000020", "v20"); // skipped
-  Put("0000000050", "v50");
-  Flush();
+  ASSERT_OK(Put("0000000010", "v10"));
+  ASSERT_OK(Put("0000000020", "v20")); // skipped
+  ASSERT_OK(Put("0000000050", "v50"));
+  ASSERT_OK(Flush());
   cfilter_skips = 0;
   EXPECT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
@@ -822,13 +910,13 @@ TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalse) {
   options.compaction_filter = new TestNotSupportedFilter();
   DestroyAndReopen(options);
-  Put("a", "v10");
-  Put("z", "v20");
-  Flush();
+  ASSERT_OK(Put("a", "v10"));
+  ASSERT_OK(Put("z", "v20"));
+  ASSERT_OK(Flush());
-  Put("a", "v10");
-  Put("z", "v20");
-  Flush();
+  ASSERT_OK(Put("a", "v10"));
+  ASSERT_OK(Put("z", "v20"));
+  ASSERT_OK(Flush());
   // Comapction should fail because IgnoreSnapshots() = false
   EXPECT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
@@ -837,6 +925,49 @@
   delete options.compaction_filter;
 }
+class TestNotSupportedFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit TestNotSupportedFilterFactory(TableFileCreationReason reason)
+      : reason_(reason) {}
+
+  bool ShouldFilterTableFileCreation(
+      TableFileCreationReason reason) const override {
+    return reason_ == reason;
+  }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /* context */) override {
+    return std::unique_ptr<CompactionFilter>(new TestNotSupportedFilter());
+  }
+
+  const char* Name() const override { return "TestNotSupportedFilterFactory"; }
+
+ private:
+  const TableFileCreationReason reason_;
+};
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseDuringFlush) {
+  Options options = CurrentOptions();
+  options.compaction_filter_factory =
+      std::make_shared<TestNotSupportedFilterFactory>(
+          TableFileCreationReason::kFlush);
+  Reopen(options);
+
+  ASSERT_OK(Put("a", "v10"));
+  ASSERT_TRUE(Flush().IsNotSupported());
+}
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseRecovery) {
Options options = CurrentOptions(); + options.compaction_filter_factory = + std::make_shared( + TableFileCreationReason::kRecovery); + Reopen(options); + + ASSERT_OK(Put("a", "v10")); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index aac39d980d9..6cb1abfab16 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -7,7 +7,11 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include + +#include "db/blob/blob_index.h" #include "db/db_test_util.h" +#include "env/mock_env.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/concurrent_task_limiter.h" @@ -18,6 +22,7 @@ #include "util/concurrent_task_limiter_impl.h" #include "util/random.h" #include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -176,6 +181,7 @@ Options DeletionTriggerOptions(Options options) { options.target_file_size_base * options.target_file_size_multiplier; options.max_bytes_for_level_multiplier = 2; options.disable_auto_compactions = false; + options.compaction_options_universal.max_size_amplification_percent = 100; return options; } @@ -251,7 +257,7 @@ void VerifyCompactionStats(ColumnFamilyData& cfd, const CompactionStatsCollector& collector) { #ifndef NDEBUG InternalStats* internal_stats_ptr = cfd.internal_stats(); - ASSERT_TRUE(internal_stats_ptr != nullptr); + ASSERT_NE(internal_stats_ptr, nullptr); const std::vector& comp_stats = internal_stats_ptr->TEST_GetCompactionStats(); const int num_of_reasons = static_cast(CompactionReason::kNumOfReasons); @@ -323,19 +329,41 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) { values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[0] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[1] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); - // must have much smaller db size. - ASSERT_GT(db_size[0] / 3, db_size[1]); + if (options.compaction_style == kCompactionStyleUniversal) { + // Claim: in universal compaction none of the original data will remain + // once compactions settle. + // + // Proof: The compensated size of the file containing the most tombstones + // is enough on its own to trigger size amp compaction. Size amp + // compaction is a full compaction, so all tombstones meet the obsolete + // keys they cover. + ASSERT_EQ(0, db_size[1]); + } else { + // Claim: in level compaction at most `db_size[0] / 2` of the original + // data will remain once compactions settle. + // + // Proof: Assume the original data is all in the bottom level. If it were + // not, it would meet its tombstone sooner. The original data size is + // large enough to require fanout to bottom level to be greater than + // `max_bytes_for_level_multiplier == 2`. 
In the level just above, + // tombstones must cover less than `db_size[0] / 4` bytes since fanout >= + // 2 and file size is compensated by doubling the size of values we expect + // are covered (`kDeletionWeightOnCompaction == 2`). The tombstones in + // levels above must cover less than `db_size[0] / 8` bytes of original + // data, `db_size[0] / 16`, and so on. + ASSERT_GT(db_size[0] / 2, db_size[1]); + } } } #endif // ROCKSDB_VALGRIND_RUN @@ -382,8 +410,9 @@ TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) { cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_TRUE( + dbfull()->CompactRange(cro, nullptr, nullptr).IsInvalidArgument()); // check that normal user iterator doesn't see anything Iterator* db_iter = dbfull()->NewIterator(ReadOptions()); @@ -391,6 +420,7 @@ TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) { for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { i++; } + ASSERT_OK(db_iter->status()); ASSERT_EQ(i, 0); delete db_iter; @@ -398,6 +428,7 @@ TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) { ReadOptions ro; ro.iter_start_seqnum=1; db_iter = dbfull()->NewIterator(ro); + ASSERT_OK(db_iter->status()); i = 0; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { i++; @@ -407,9 +438,10 @@ TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) { // now all deletes should be gone SetPreserveDeletesSequenceNumber(100000000); - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr)); db_iter = dbfull()->NewIterator(ro); + ASSERT_TRUE(db_iter->status().IsInvalidArgument()); i = 0; for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { i++; @@ -471,6 +503,10 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { options.new_table_reader_for_compaction_inputs = true; options.max_open_files = 20; options.level0_file_num_compaction_trigger = 3; + // Avoid many shards with small max_open_files, where as little as + // two table insertions could lead to an LRU eviction, depending on + // hash values. + options.table_cache_numshardbits = 2; DestroyAndReopen(options); Random rnd(301); @@ -495,8 +531,8 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { ASSERT_OK(Put(Key(10 - k), "bar")); if (k < options.level0_file_num_compaction_trigger - 1) { num_table_cache_lookup = 0; - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // preloading iterator issues one table cache lookup and create // a new table reader, if not preloaded. int old_num_table_cache_lookup = num_table_cache_lookup; @@ -514,8 +550,8 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { num_table_cache_lookup = 0; num_new_table_reader = 0; - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Preloading iterator issues one table cache lookup and creates // a new table reader. One file is created for flush and one for compaction. 
// Compaction inputs make no table cache look-up for data/range deletion @@ -542,7 +578,7 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { cro.change_level = true; cro.target_level = 2; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); // Only verifying compaction outputs issues one table cache lookup // for both data block and range deletion block). // May preload table cache too. @@ -583,9 +619,9 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) { values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[0] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); Close(); // round 2 --- disable auto-compactions and issue deletions. @@ -596,11 +632,10 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) { for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); } - db_size[1] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); Close(); - // as auto_compaction is off, we shouldn't see too much reduce - // in db size. - ASSERT_LT(db_size[0] / 3, db_size[1]); + // as auto_compaction is off, we shouldn't see any reduction in db size. + ASSERT_LE(db_size[0], db_size[1]); // round 3 --- reopen db with auto_compaction on and see if // deletion compensation still work. @@ -610,11 +645,17 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) { for (int k = 0; k < kTestSize / 10; ++k) { ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[2] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2])); // this time we're expecting significant drop in size. - ASSERT_GT(db_size[0] / 3, db_size[2]); + // + // See "CompactionDeletionTrigger" test for proof that at most + // `db_size[0] / 2` of the original data remains. In addition to that, this + // test inserts `db_size[0] / 10` to push the tombstones into SST files and + // then through automatic compactions. So in total `3 * db_size[0] / 5` of + // the original data may remain. 
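Aside on the `3 * db_size[0] / 5` bound asserted just below: it combines the two contributions described in the comment above, at most `db_size[0] / 2` of the original data surviving the deletion-triggered compactions plus roughly `db_size[0] / 10` of freshly re-inserted data, and `db_size[0] / 2 + db_size[0] / 10 = 6 * db_size[0] / 10 = 3 * db_size[0] / 5`.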
+ ASSERT_GT(3 * db_size[0] / 5, db_size[2]); } } @@ -630,7 +671,7 @@ TEST_F(DBCompactionTest, CompactRangeBottomPri) { CompactRangeOptions cro; cro.change_level = true; cro.target_level = 2; - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); } ASSERT_EQ("0,0,3", FilesPerLevel(0)); @@ -663,7 +704,7 @@ TEST_F(DBCompactionTest, CompactRangeBottomPri) { }); SyncPoint::GetInstance()->EnableProcessing(); env_->SetBackgroundThreads(1, Env::Priority::BOTTOM); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(1, low_pri_count); ASSERT_EQ(1, bottom_pri_count); ASSERT_EQ("0,0,2", FilesPerLevel(0)); @@ -671,12 +712,12 @@ TEST_F(DBCompactionTest, CompactRangeBottomPri) { // Recompact bottom most level uses bottom pool CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); ASSERT_EQ(1, low_pri_count); ASSERT_EQ(2, bottom_pri_count); env_->SetBackgroundThreads(0, Env::Priority::BOTTOM); - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); // Low pri pool is used if bottom pool has size 0. ASSERT_EQ(2, low_pri_count); ASSERT_EQ(2, bottom_pri_count); @@ -701,9 +742,16 @@ TEST_F(DBCompactionTest, DisableStatsUpdateReopen) { values.push_back(rnd.RandomString(kCDTValueSize)); ASSERT_OK(Put(Key(k), values[k])); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[0] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // L1 and L2 can fit deletions iff size compensation does not take effect, + // i.e., when `skip_stats_update_on_db_open == true`. Move any remaining + // files at or above L2 down to L3 to ensure obsolete data does not + // accidentally meet its tombstone above L3. This makes the final size more + // deterministic and easy to see whether size compensation for deletions + // took effect. + MoveFilesToLevel(3 /* level */); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0])); Close(); // round 2 --- disable auto-compactions and issue deletions. @@ -716,27 +764,33 @@ TEST_F(DBCompactionTest, DisableStatsUpdateReopen) { for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); } - db_size[1] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1])); Close(); - // as auto_compaction is off, we shouldn't see too much reduce - // in db size. - ASSERT_LT(db_size[0] / 3, db_size[1]); + // as auto_compaction is off, we shouldn't see any reduction in db size. + ASSERT_LE(db_size[0], db_size[1]); // round 3 --- reopen db with auto_compaction on and see if // deletion compensation still work. options.disable_auto_compactions = false; Reopen(options); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - db_size[2] = Size(Key(0), Key(kTestSize - 1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2])); if (options.skip_stats_update_on_db_open) { // If update stats on DB::Open is disable, we don't expect // deletion entries taking effect. 
- ASSERT_LT(db_size[0] / 3, db_size[2]); + // + // The deletions are small enough to fit in L1 and L2, and obsolete keys + // were moved to L3+, so none of the original data should have been + // dropped. + ASSERT_LE(db_size[0], db_size[2]); } else { // Otherwise, we should see a significant drop in db size. - ASSERT_GT(db_size[0] / 3, db_size[2]); + // + // See "CompactionDeletionTrigger" test for proof that at most + // `db_size[0] / 2` of the original data remains. + ASSERT_GT(db_size[0] / 2, db_size[2]); } } } @@ -766,7 +820,7 @@ TEST_P(DBCompactionTestWithParam, CompactionTrigger) { } // put extra key to trigger flush ASSERT_OK(Put(1, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); } @@ -778,7 +832,7 @@ TEST_P(DBCompactionTestWithParam, CompactionTrigger) { } // put extra key to trigger flush ASSERT_OK(Put(1, "", "")); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1); @@ -820,7 +874,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { } // put extra key to trigger flush ASSERT_OK(Put(cf, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); } } @@ -837,7 +891,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { } // put extra key to trigger flush ASSERT_OK(Put(2, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, NumTableFilesAtLevel(0, 2)); } @@ -848,7 +902,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { sleeping_tasks[i].WakeUp(); sleeping_tasks[i].WaitUntilDone(); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify number of compactions allowed will come back to 1. 
@@ -865,7 +919,7 @@ TEST_F(DBCompactionTest, BGCompactionsAllowed) { } // put extra key to trigger flush ASSERT_OK(Put(cf, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); } } @@ -898,8 +952,8 @@ TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) { // Reopening moves updates to level-0 ReopenWithColumnFamilies({"default", "pikachu"}, options); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], - true /* disallow trivial move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow trivial move */)); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_GT(NumTableFilesAtLevel(1, 1), 1); @@ -943,27 +997,27 @@ TEST_F(DBCompactionTest, UserKeyCrossFile1) { DestroyAndReopen(options); // create first file and flush to l0 - Put("4", "A"); - Put("3", "A"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - - Put("2", "A"); - Delete("3"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("4", "A")); + ASSERT_OK(Put("3", "A")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("2", "A")); + ASSERT_OK(Delete("3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("NOT_FOUND", Get("3")); // move both files down to l1 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("NOT_FOUND", Get("3")); for (int i = 0; i < 3; i++) { - Put("2", "B"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("2", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("NOT_FOUND", Get("3")); } @@ -976,27 +1030,27 @@ TEST_F(DBCompactionTest, UserKeyCrossFile2) { DestroyAndReopen(options); // create first file and flush to l0 - Put("4", "A"); - Put("3", "A"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - - Put("2", "A"); - SingleDelete("3"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("4", "A")); + ASSERT_OK(Put("3", "A")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + ASSERT_OK(Put("2", "A")); + ASSERT_OK(SingleDelete("3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ("NOT_FOUND", Get("3")); // move both files down to l1 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("NOT_FOUND", Get("3")); for (int i = 0; i < 3; i++) { - Put("2", "B"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("2", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("NOT_FOUND", Get("3")); } @@ -1012,17 +1066,17 @@ TEST_F(DBCompactionTest, CompactionSstPartitioner) { DestroyAndReopen(options); // create first file and flush to l0 - Put("aaaa1", "A"); - Put("bbbb1", "B"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("aaaa1", "A")); + ASSERT_OK(Put("bbbb1", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - Put("aaaa1", "A2"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("aaaa1", "A2")); + 
ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // move both files down to l1 - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); std::vector files; dbfull()->GetLiveFilesMetaData(&files); @@ -1042,11 +1096,11 @@ TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) { DestroyAndReopen(options); // create first file and flush to l0 - Put("aaaa1", "A"); - Put("bbbb1", "B"); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(Put("aaaa1", "A")); + ASSERT_OK(Put("bbbb1", "B")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); std::vector files; dbfull()->GetLiveFilesMetaData(&files); @@ -1076,22 +1130,23 @@ TEST_F(DBCompactionTest, ZeroSeqIdCompaction) { // create first file and flush to l0 for (auto& key : {"1", "2", "3", "3", "3", "3"}) { - Put(key, std::string(key_len, 'A')); + ASSERT_OK(Put(key, std::string(key_len, 'A'))); snaps.push_back(dbfull()->GetSnapshot()); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // create second file and flush to l0 for (auto& key : {"3", "4", "5", "6", "7", "8"}) { - Put(key, std::string(key_len, 'A')); + ASSERT_OK(Put(key, std::string(key_len, 'A'))); snaps.push_back(dbfull()->GetSnapshot()); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // move both files down to l1 - dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1); + ASSERT_OK( + dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1)); // release snap so that first instance of key(3) can have seqId=0 for (auto snap : snaps) { @@ -1100,12 +1155,12 @@ TEST_F(DBCompactionTest, ZeroSeqIdCompaction) { // create 3 files in l0 so to trigger compaction for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) { - Put("2", std::string(1, 'A')); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put("2", std::string(1, 'A'))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(Put("", "")); } @@ -1120,12 +1175,12 @@ TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) { for (int i = 0; i < 2; ++i) { for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) { // make l0 files' ranges overlap to avoid trivial move - Put(std::to_string(2 * i), std::string(1, 'A')); - Put(std::to_string(2 * i + 1), std::string(1, 'A')); - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(Put(std::to_string(2 * i), std::string(1, 'A'))); + ASSERT_OK(Put(std::to_string(2 * i + 1), std::string(1, 'A'))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 0), i + 1); } @@ -1141,7 +1196,7 @@ TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) { // note CompactionOptions::output_file_size_limit is unset. 
CompactionOptions compact_opt; compact_opt.compression = kNoCompression; - dbfull()->CompactFiles(compact_opt, input_filenames, 1); + ASSERT_OK(dbfull()->CompactFiles(compact_opt, input_filenames, 1)); } // Check that writes done during a memtable compaction are recovered @@ -1202,7 +1257,7 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) { cro.exclusive_manual_compaction = exclusive_manual_compaction_; // Compaction will initiate a trivial move from L0 to L1 - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); // File moved From L0 to L1 ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0 @@ -1271,7 +1326,7 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { // Since data is non-overlapping we expect compaction to initiate // a trivial move - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); // We expect that all the files were trivially moved from L0 to L1 ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files); @@ -1308,7 +1363,7 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { ASSERT_OK(Flush()); } - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); for (size_t i = 0; i < ranges.size(); i++) { for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) { @@ -1512,8 +1567,8 @@ TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) { ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0)); TEST_SYNC_POINT("DBCompaction::ManualPartial:5"); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // After two non-trivial compactions are installed, there is 1 file in L6, and // 1 file in L1 ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0)); @@ -1626,7 +1681,7 @@ TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) { for (int32_t j = 300; j < 4300; j++) { if (j == 2300) { ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); @@ -1642,8 +1697,8 @@ TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) { } TEST_SYNC_POINT("DBCompaction::PartialFill:2"); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); threads.join(); for (int32_t i = 0; i < 4300; i++) { @@ -1661,12 +1716,12 @@ TEST_F(DBCompactionTest, ManualCompactionWithUnorderedWrite) { Options options = CurrentOptions(); options.unordered_write = true; DestroyAndReopen(options); - Put("foo", "v1"); + ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Flush()); - Put("bar", "v1"); + ASSERT_OK(Put("bar", "v1")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - port::Thread writer([&]() { Put("foo", "v2"); }); + port::Thread writer([&]() { ASSERT_OK(Put("foo", "v2")); }); TEST_SYNC_POINT( "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"); @@ -1732,15 +1787,15 @@ TEST_F(DBCompactionTest, DeleteFileRange) { for (int32_t j = 300; j < 4300; j++) { if (j == 2300) { ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } values[j] = rnd.RandomString(value_size); ASSERT_OK(Put(Key(j), values[j])); } } ASSERT_OK(Flush()); - 
dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify level sizes uint64_t target_size = 4 * options.max_bytes_for_level_base; @@ -1750,7 +1805,7 @@ TEST_F(DBCompactionTest, DeleteFileRange) { options.max_bytes_for_level_multiplier); } - size_t old_num_files = CountFiles(); + const size_t old_num_files = CountFiles(); std::string begin_string = Key(1000); std::string end_string = Key(2000); Slice begin(begin_string); @@ -1785,7 +1840,7 @@ TEST_F(DBCompactionTest, DeleteFileRange) { compact_options.change_level = true; compact_options.target_level = 1; ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK( DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr)); @@ -1794,12 +1849,11 @@ TEST_F(DBCompactionTest, DeleteFileRange) { for (int32_t i = 0; i < 4300; i++) { ReadOptions roptions; std::string result; - Status s = db_->Get(roptions, Key(i), &result); - ASSERT_TRUE(s.IsNotFound()); + ASSERT_TRUE(db_->Get(roptions, Key(i), &result).IsNotFound()); deleted_count2++; } ASSERT_GT(deleted_count2, deleted_count); - size_t new_num_files = CountFiles(); + const size_t new_num_files = CountFiles(); ASSERT_GT(old_num_files, new_num_files); } @@ -1954,14 +2008,14 @@ TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) { std::string vals[kNumL0Files]; for (int i = 0; i < kNumL0Files; ++i) { vals[i] = rnd.RandomString(kValSize); - Put(Key(i), vals[i]); - Put(Key(i + 1), vals[i]); - Flush(); + ASSERT_OK(Put(Key(i), vals[i])); + ASSERT_OK(Put(Key(i + 1), vals[i])); + ASSERT_OK(Flush()); if (i == 0) { snapshot = db_->GetSnapshot(); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify `DeleteFilesInRange` can't drop only file 0 which would cause // "1 -> vals[0]" to reappear. @@ -2048,16 +2102,8 @@ TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) { options.num_levels = 4; options.max_bytes_for_level_base = 400 * 1024; options.max_subcompactions = max_subcompactions_; - // options = CurrentOptions(options); - std::vector filenames; - env_->GetChildren(options.db_paths[1].path, &filenames); - // Delete archival files. - for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); - } - env_->DeleteDir(options.db_paths[1].path); - Reopen(options); + DestroyAndReopen(options); Random rnd(301); int key_idx = 0; @@ -2165,16 +2211,8 @@ TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) { options.num_levels = 4; options.max_bytes_for_level_base = 400 * 1024; options.max_subcompactions = max_subcompactions_; - // options = CurrentOptions(options); - std::vector filenames; - env_->GetChildren(options.db_paths[1].path, &filenames); - // Delete archival files. 
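// ---------------------------------------------------------------------------
// A minimal standalone sketch (not from this patch) of the API the
// DeleteFileRange test above exercises: DeleteFilesInRange() from
// rocksdb/convenience.h drops whole SST files fully contained in a key range
// without rewriting any data. The wrapper name below is illustrative.
#include <string>
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"

rocksdb::Status DropKeyRangeFiles(rocksdb::DB* db, const std::string& from,
                                  const std::string& to) {
  rocksdb::Slice begin(from);
  rocksdb::Slice end(to);
  // Only files whose key range lies entirely inside [begin, end] are deleted;
  // keys in partially overlapping files survive, which is why the test still
  // issues a CompactRange() afterwards to make every key disappear.
  return rocksdb::DeleteFilesInRange(db, db->DefaultColumnFamily(), &begin,
                                     &end);
}
// ---------------------------------------------------------------------------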
- for (size_t i = 0; i < filenames.size(); ++i) { - env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); - } - env_->DeleteDir(options.db_paths[1].path); - Reopen(options); + DestroyAndReopen(options); Random rnd(301); int key_idx = 0; @@ -2411,7 +2449,7 @@ TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); } ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(TotalTableFiles(1, 4), 1); int non_level0_num_files = 0; @@ -2447,7 +2485,8 @@ TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { compact_options.bottommost_level_compaction = BottommostLevelCompaction::kForce; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK( + dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr)); // Only 1 file in L0 ASSERT_EQ("1", FilesPerLevel(1)); @@ -2468,9 +2507,9 @@ TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000))); } - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = 1; i < options.num_levels; i++) { ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); @@ -2480,6 +2519,7 @@ TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { // compaction style std::string keys_in_db; Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]); + ASSERT_OK(iter->status()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { keys_in_db.append(iter->key().ToString()); keys_in_db.push_back(','); @@ -2517,24 +2557,24 @@ TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) { TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Delete(1, "e"); - Put(1, "", ""); + ASSERT_OK(Delete(1, "e")); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "c", "cv"); + ASSERT_OK(Put(1, "c", "cv")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "d", "dv"); + ASSERT_OK(Put(1, "d", "dv")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Put(1, "", ""); + ASSERT_OK(Put(1, "", "")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); - Delete(1, "d"); - Delete(1, "b"); + ASSERT_OK(Delete(1, "d")); + ASSERT_OK(Delete(1, "b")); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("(->)(c->cv)", Contents(1)); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish @@ -2551,34 +2591,35 @@ TEST_F(DBCompactionTest, ManualAutoRace) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put(1, "foo", ""); - Put(1, "bar", ""); - Flush(1); - Put(1, "foo", ""); - Put(1, "bar", ""); + ASSERT_OK(Put(1, "foo", "")); + ASSERT_OK(Put(1, "bar", "")); + 
ASSERT_OK(Flush(1)); + ASSERT_OK(Put(1, "foo", "")); + ASSERT_OK(Put(1, "bar", "")); // Generate four files in CF 0, which should trigger an auto compaction - Put("foo", ""); - Put("bar", ""); - Flush(); - Put("foo", ""); - Put("bar", ""); - Flush(); - Put("foo", ""); - Put("bar", ""); - Flush(); - Put("foo", ""); - Put("bar", ""); - Flush(); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "")); + ASSERT_OK(Put("bar", "")); + ASSERT_OK(Flush()); // The auto compaction is scheduled but waited until here TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1"); // The auto compaction will wait until the manual compaction is registerd // before processing so that it will be cancelled. - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr)); ASSERT_EQ("0,1", FilesPerLevel(1)); // Eventually the cancelled compaction will be rescheduled and executed. - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(0)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -2623,7 +2664,7 @@ TEST_P(DBCompactionTestWithParam, ManualCompaction) { options.statistics->getTickerCount(BLOCK_CACHE_ADD); CompactRangeOptions cro; cro.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(cro, handles_[1], nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr)); // Verify manual compaction doesn't fill block cache ASSERT_EQ(prev_block_cache_add, options.statistics->getTickerCount(BLOCK_CACHE_ADD)); @@ -2704,7 +2745,8 @@ TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) { CompactRangeOptions compact_options; compact_options.target_path_id = 1; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK( + db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(1)); @@ -2763,8 +2805,8 @@ TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) { for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { ASSERT_OK(Put(1, ToString(key), rnd.RandomString(kTestValueSize))); } - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ColumnFamilyMetaData cf_meta; dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); @@ -2840,10 +2882,10 @@ TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) { keys.emplace_back(rnd.RandomString(kKeySize)); values.emplace_back(rnd.RandomString(kKvSize - kKeySize)); ASSERT_OK(Put(Slice(keys[k]), Slice(values[k]))); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_FlushMemTable(true); + ASSERT_OK(dbfull()->TEST_FlushMemTable(true)); // Make sure the number of L0 files can trigger compaction. 
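// ---------------------------------------------------------------------------
// A minimal sketch (not from this patch) of the ordering facility that
// ManualAutoRace and many later tests use: the internal sync-point API from
// test_util/sync_point.h, which is test-only and not part of the public
// interface. The marker names below are placeholders, not the exact markers
// used in this file.
#include "test_util/sync_point.h"

void ArmExampleSyncPointOrdering() {
  // A pair {"A", "B"} means "A happens before B": whichever thread reaches
  // marker "B" blocks until some thread has already passed marker "A".
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"Example::Background:Done", "Example::Test:Check"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  // The test thread later emits TEST_SYNC_POINT("Example::Test:Check") and
  // finally calls DisableProcessing() to restore normal execution.
}
// ---------------------------------------------------------------------------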
ASSERT_GE(NumTableFilesAtLevel(0), options.level0_file_num_compaction_trigger); @@ -2909,7 +2951,7 @@ TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) { ASSERT_OK(Flush()); } // this should execute L0->L1 - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(0)); // block compactions @@ -2926,7 +2968,7 @@ TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) { sleeping_task.WaitUntilDone(); // this should execute L1->L2 (move) - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1", FilesPerLevel(0)); @@ -2944,7 +2986,7 @@ TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) { ASSERT_OK(Flush()); } // this should execute both L0->L1 and L1->L2 (merge with previous file) - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,2", FilesPerLevel(0)); @@ -2952,6 +2994,7 @@ TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) { ASSERT_OK(env_->FileExists(dbname_ + moved_file_name)); listener->SetExpectedFileName(dbname_ + moved_file_name); + ASSERT_OK(iterator->status()); iterator.reset(); // this file should have been compacted away @@ -3114,7 +3157,7 @@ TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) { for (int num = 0; num < 10; num++) { GenerateNewRandomFile(&rnd); } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"CompactionJob::Run():Start", @@ -3135,7 +3178,7 @@ TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) { "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"); GenerateNewRandomFile(&rnd, /* nowait */ true); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr)); for (int num = 0; num < options.level0_file_num_compaction_trigger + 1; num++) { @@ -3145,7 +3188,7 @@ TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) { TEST_SYNC_POINT( "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } static std::string ShortKey(int i) { @@ -3270,7 +3313,7 @@ TEST_P(DBCompactionTestWithParam, IntraL0Compaction) { table_options.block_cache = NewLRUCache(64 << 20); // 64MB table_options.cache_index_and_filter_blocks = true; table_options.pin_l0_filter_and_index_blocks_in_cache = true; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -3309,7 +3352,7 @@ TEST_P(DBCompactionTestWithParam, IntraL0Compaction) { } ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); std::vector> level_to_files; @@ -3387,7 +3430,7 @@ TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) { ASSERT_OK(Put(Key(i + 1), value)); ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); std::vector> level_to_files; @@ -3435,7 +3478,7 @@ TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) { int key_idx = 0; GenerateNewFile(&rnd, &key_idx); } 
- dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, num_bottom_pri_compactions); @@ -3459,8 +3502,8 @@ TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) { // So key 0, 2, and 4+ fall outside these levels' key-ranges. for (int level = 2; level >= 1; --level) { for (int i = 0; i < 2; ++i) { - Put(Key(2 * i + 1), "val"); - Flush(); + ASSERT_OK(Put(Key(2 * i + 1), "val")); + ASSERT_OK(Flush()); } MoveFilesToLevel(level); ASSERT_EQ(2, NumTableFilesAtLevel(level)); @@ -3470,11 +3513,11 @@ TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) { // - Tombstones for keys 2 and 4 can be dropped early. // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges. for (int i = 0; i < kNumL0Files; ++i) { - Put(Key(0), "val"); // sentinel to prevent trivial move - Delete(Key(i + 1)); - Flush(); + ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move + ASSERT_OK(Delete(Key(i + 1))); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = 0; i < kNumL0Files; ++i) { std::string value; @@ -3538,10 +3581,10 @@ TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) { TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) { // Regression test for bug of not pulling in L0 files that overlap the user- // specified input files in time- and key-ranges. - Put(Key(0), "old_val"); - Flush(); - Put(Key(0), "new_val"); - Flush(); + ASSERT_OK(Put(Key(0), "old_val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(0), "new_val")); + ASSERT_OK(Flush()); ColumnFamilyMetaData cf_meta; dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); @@ -3557,6 +3600,41 @@ TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) { ASSERT_EQ("new_val", Get(Key(0))); } +TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + const Snapshot* snapshot = nullptr; + const int kMaxKey = 10; + + for (int i = 0; i < kMaxKey; i++) { + ASSERT_OK(Put(Key(i), Key(i))); + ASSERT_OK(Delete(Key(i))); + if (!snapshot) { + snapshot = db_->GetSnapshot(); + } + } + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Put(Key(kMaxKey), Key(kMaxKey))); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // test DeleteFilesInRange() deletes the files already picked for compaction + SyncPoint::GetInstance()->LoadDependency( + {{"VersionSet::LogAndApply:WriteManifestStart", + "BackgroundCallCompaction:0"}, + {"DBImpl::BackgroundCompaction:Finish", + "VersionSet::LogAndApply:WriteManifestDone"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + // release snapshot which mark bottommost file for compaction + db_->ReleaseSnapshot(snapshot); + std::string begin_string = Key(0); + std::string end_string = Key(kMaxKey + 1); + Slice begin(begin_string); + Slice end(end_string); + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { // bottom-level files may contain deletions due to snapshots protecting the // deleted keys. 
Once the snapshot is released, we should see files with many @@ -3587,12 +3665,12 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { ASSERT_OK(Delete(Key(j))); } } - Flush(); + ASSERT_OK(Flush()); if (i < kNumLevelFiles - 1) { ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1)); std::vector pre_release_metadata, post_release_metadata; @@ -3613,7 +3691,7 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { CompactionReason::kBottommostFiles); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->GetLiveFilesMetaData(&post_release_metadata); ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size()); @@ -3662,12 +3740,12 @@ TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) { ASSERT_OK(Delete(Key(j))); } } - Flush(); + ASSERT_OK(Flush()); if (i < kNumLevelFiles - 1) { ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } } - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr)); ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1)); std::vector pre_release_metadata, post_release_metadata; @@ -3683,7 +3761,7 @@ TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) { [&](void* /*arg*/) { num_compactions.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); db_->ReleaseSnapshot(snapshot); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, num_compactions); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -3721,9 +3799,9 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { ASSERT_OK( Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); ASSERT_EQ("0,0,0,2", FilesPerLevel()); @@ -3732,9 +3810,9 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2,0,0,2", FilesPerLevel()); MoveFilesToLevel(1); ASSERT_EQ("0,2,0,2", FilesPerLevel()); @@ -3745,14 +3823,14 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { // Just do a simple write + flush so that the Ttl expired files get // compacted. ASSERT_OK(Put("a", "1")); - Flush(); + ASSERT_OK(Flush()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { Compaction* compaction = reinterpret_cast(arg); ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // All non-L0 files are deleted, as they contained only deleted data. 
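// ---------------------------------------------------------------------------
// A minimal sketch (not from this patch) of the TTL mechanism the
// LevelCompactExpiredTtlFiles cases above exercise: files older than
// ColumnFamilyOptions::ttl are compacted even without write pressure, and the
// threshold can be tightened on a live DB, which is what triggers the kTtl
// compaction once the mocked clock has advanced. The function name is
// illustrative.
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status TightenTtl(rocksdb::DB* db) {
  // At open time the tests use the equivalent of:
  //   options.ttl = 24 * 60 * 60;  // compact files older than 24 hours
  // Dynamically lowering it to 10 hours, as the test does, makes the
  // already-12-hour-old files eligible immediately:
  return db->SetOptions({{"ttl", "36000"}});
}
// ---------------------------------------------------------------------------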
ASSERT_EQ("1", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -3768,9 +3846,9 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { ASSERT_OK( Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); ASSERT_EQ("0,0,0,2", FilesPerLevel()); @@ -3779,9 +3857,9 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { for (int j = 0; j < kNumKeysPerFile; ++j) { ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2,0,0,2", FilesPerLevel()); MoveFilesToLevel(1); ASSERT_EQ("0,2,0,2", FilesPerLevel()); @@ -3790,8 +3868,8 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { // trigger as ttl is set to 24 hours. env_->MockSleepForSeconds(12 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,2,0,2", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -3804,7 +3882,7 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { // Dynamically change ttl to 10 hours. // This should trigger a ttl compaction, as 12 hours have already passed. ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // All non-L0 files are deleted, as they contained only deleted data. ASSERT_EQ("1", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -3864,7 +3942,7 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { for (int i = 1; i <= 100; ++i) { ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); // Get the first file's creation time. This will be the oldest file in the // DB. Compactions inolving this file's descendents should keep getting // this time. 
@@ -3877,7 +3955,7 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { for (int i = 101; i <= 200; ++i) { ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); MoveFilesToLevel(6); ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); @@ -3886,12 +3964,12 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { for (int i = 1; i <= 50; ++i) { ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); env_->MockSleepForSeconds(1 * 60 * 60); for (int i = 51; i <= 150; ++i) { ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); MoveFilesToLevel(4); ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel()); @@ -3900,8 +3978,8 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { for (int i = 26; i <= 75; ++i) { ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(1); ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel()); @@ -3931,9 +4009,9 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { if (if_restart) { Reopen(options); } else { - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); ASSERT_EQ(5, ttl_compactions); @@ -3946,9 +4024,9 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { if (if_restart) { Reopen(options); } else { - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel()); ASSERT_GE(ttl_compactions, 6); @@ -4013,9 +4091,9 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { ASSERT_OK( Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2", FilesPerLevel()); ASSERT_EQ(0, periodic_compactions); @@ -4023,8 +4101,8 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { // Add 50 hours and do a write env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Assert that the files stay in the same level ASSERT_EQ("3", FilesPerLevel()); // The two old files go through the periodic compaction process @@ -4039,9 +4117,9 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { if (if_restart) { Reopen(options); } else { - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("1,3", FilesPerLevel()); // The three old files now go through the periodic compaction process. 2 // + 3. @@ -4050,8 +4128,8 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { // Add another 50 hours and do another write env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("c", "3")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2,3", FilesPerLevel()); // The four old files now go through the periodic compaction process. 5 // + 4. @@ -4108,7 +4186,7 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { ASSERT_OK( Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); // Move the first two files to L2. 
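// ---------------------------------------------------------------------------
// A minimal sketch (not from this patch) of the knob behind the
// LevelPeriodicCompaction tests above: ColumnFamilyOptions::
// periodic_compaction_seconds causes any file older than the threshold to be
// rewritten by a periodic compaction, regardless of its level or of write
// activity. The function name and the 48-hour value are illustrative.
#include "rocksdb/options.h"

rocksdb::Options MakePeriodicCompactionOptions() {
  rocksdb::Options options;
  // Rewrite every file at least once every 48 hours so that, for example,
  // compaction filters get a chance to run over cold data.
  options.periodic_compaction_seconds = 48 * 60 * 60;
  return options;
}
// ---------------------------------------------------------------------------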
if (i == 1) { MoveFilesToLevel(2); @@ -4172,9 +4250,9 @@ TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { ASSERT_OK( Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); MoveFilesToLevel(3); @@ -4185,8 +4263,8 @@ TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { // Add some time greater than periodic_compaction_time. env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Files in the bottom level go through periodic compactions. ASSERT_EQ("1,0,0,2", FilesPerLevel()); ASSERT_EQ(2, periodic_compactions); @@ -4195,8 +4273,8 @@ TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { // Add a little more time than ttl env_->MockSleepForSeconds(11 * 60 * 60); ASSERT_OK(Put("b", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Notice that the previous file in level 1 falls down to the bottom level // due to ttl compactions, one level at a time. // And bottom level files don't get picked up for ttl compactions. @@ -4207,8 +4285,8 @@ TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) { // Add some time greater than periodic_compaction_time. env_->MockSleepForSeconds(50 * 60 * 60); ASSERT_OK(Put("c", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Previous L0 file falls one level at a time to bottom level due to ttl. // And all 4 bottom files go through periodic compactions. ASSERT_EQ("1,0,0,4", FilesPerLevel()); @@ -4284,9 +4362,9 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) { ASSERT_OK( Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); } - Flush(); + ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("2", FilesPerLevel()); ASSERT_EQ(0, periodic_compactions); @@ -4294,8 +4372,8 @@ TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) { // Add 31 days and do a write env_->MockSleepForSeconds(31 * 24 * 60 * 60); ASSERT_OK(Put("a", "1")); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Assert that the files stay in the same level ASSERT_EQ("3", FilesPerLevel()); // The two old files go through the periodic compaction process @@ -4344,16 +4422,16 @@ TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) { for (int k = 0; k < 2; ++k) { ASSERT_OK(Put(Key(k), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } auto manual_compaction_thread = port::Thread([this]() { CompactRangeOptions cro; cro.allow_write_stall = false; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); }); manual_compaction_thread.join(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -4400,17 +4478,17 @@ TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) { FlushOptions flush_opts; flush_opts.wait = false; flush_opts.allow_write_stall = true; - dbfull()->Flush(flush_opts); + ASSERT_OK(dbfull()->Flush(flush_opts)); } auto manual_compaction_thread = port::Thread([this]() 
{ CompactRangeOptions cro; cro.allow_write_stall = false; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); }); manual_compaction_thread.join(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -4446,12 +4524,11 @@ TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { for (int k = 0; k < 2; ++k) { ASSERT_OK(Put(1, Key(k), rnd.RandomString(1024))); } - Flush(1); + ASSERT_OK(Flush(1)); } auto manual_compaction_thread = port::Thread([this, i]() { CompactRangeOptions cro; cro.allow_write_stall = false; - Status s = db_->CompactRange(cro, handles_[1], nullptr, nullptr); if (i == 0) { ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) .IsColumnFamilyDropped()); @@ -4471,7 +4548,7 @@ TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { manual_compaction_thread.join(); TEST_SYNC_POINT( "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } } @@ -4506,25 +4583,26 @@ TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) { for (int j = 0; j < 2; ++j) { ASSERT_OK(Put(Key(j), rnd.RandomString(1024))); } - dbfull()->Flush(flush_opts); + ASSERT_OK(dbfull()->Flush(flush_opts)); } auto manual_compaction_thread = port::Thread([this]() { CompactRangeOptions cro; cro.allow_write_stall = false; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); }); TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"); - Put(ToString(0), rnd.RandomString(1024)); - dbfull()->Flush(flush_opts); - Put(ToString(0), rnd.RandomString(1024)); + ASSERT_OK(Put(ToString(0), rnd.RandomString(1024))); + ASSERT_OK(dbfull()->Flush(flush_opts)); + ASSERT_OK(Put(ToString(0), rnd.RandomString(1024))); TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush"); manual_compaction_thread.join(); // If CompactRange's flush was skipped, the final Put above will still be // in the active memtable. 
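// ---------------------------------------------------------------------------
// A minimal sketch (not from this patch) of the option the
// CompactRangeDelayedBy* and CompactRangeSkipFlushAfterDelay tests above hinge
// on: with CompactRangeOptions::allow_write_stall set to false, CompactRange()
// waits for L0 / immutable-memtable pressure to clear before doing its own
// flush and compaction, rather than adding to a write stall. The function name
// is illustrative.
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status PoliteFullCompaction(rocksdb::DB* db) {
  rocksdb::CompactRangeOptions cro;
  cro.allow_write_stall = false;  // defer rather than worsen a stall
  return db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr);
}
// ---------------------------------------------------------------------------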
std::string num_keys_in_memtable; - db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, &num_keys_in_memtable); + ASSERT_TRUE(db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, + &num_keys_in_memtable)); ASSERT_EQ(ToString(1), num_keys_in_memtable); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -4582,7 +4660,7 @@ TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) { } else { ASSERT_EQ(2, num_memtable_entries); // flush anyways to prepare for next iteration - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } } } @@ -4597,12 +4675,12 @@ TEST_F(DBCompactionTest, CompactionStatsTest) { for (int i = 0; i < 32; i++) { for (int j = 0; j < 5000; j++) { - Put(std::to_string(j), std::string(1, 'A')); + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ColumnFamilyHandleImpl* cfh = static_cast(dbfull()->DefaultColumnFamily()); ColumnFamilyData* cfd = cfh->cfd(); @@ -4687,7 +4765,7 @@ TEST_F(DBCompactionTest, CompactionHasEmptyOutput) { ASSERT_OK(Delete("b")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_EQ(NumTableFilesAtLevel(1), 0); @@ -4820,7 +4898,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { } for (unsigned int cf = 0; cf < cf_count; cf++) { - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); } } @@ -4838,7 +4916,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { } // put extra key to trigger flush ASSERT_OK(Put(0, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, NumTableFilesAtLevel(0, 0)); } @@ -4853,7 +4931,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { } for (unsigned int cf = 0; cf < cf_count; cf++) { - dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -4875,7 +4953,7 @@ TEST_F(DBCompactionTest, CompactionLimiter) { // put extra key to trigger flush ASSERT_OK(Put(cf_test, "", "")); - dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test])); ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test)); Compact(cf_test, Key(0), Key(keyIndex)); @@ -4961,7 +5039,7 @@ TEST_P(CompactionPriTest, Test) { ASSERT_OK(Put(Key(keys[i]), rnd.RandomString(102))); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); for (int i = 0; i < kNKeys; i++) { ASSERT_NE("NOT_FOUND", Get(Key(i))); } @@ -5000,9 +5078,9 @@ TEST_F(DBCompactionTest, PartialManualCompaction) { Random rnd(301); for (auto i = 0; i < 8; ++i) { for (auto j = 0; j < 10; ++j) { - Merge("foo", rnd.RandomString(1024)); + ASSERT_OK(Merge("foo", rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } MoveFilesToLevel(2); @@ -5015,7 +5093,7 @@ TEST_F(DBCompactionTest, PartialManualCompaction) { CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); } TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) { @@ -5023,7 +5101,7 @@ 
TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) { // is in read-only mode. Verify it now at least returns, despite failing. const int kNumL0Files = 4; std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options opts = CurrentOptions(); opts.disable_auto_compactions = true; opts.env = mock_env.get(); @@ -5032,9 +5110,9 @@ TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) { Random rnd(301); for (int i = 0; i < kNumL0Files; ++i) { // Make sure files are overlapping in key-range to prevent trivial move. - Put("key1", rnd.RandomString(1024)); - Put("key2", rnd.RandomString(1024)); - Flush(); + ASSERT_OK(Put("key1", rnd.RandomString(1024))); + ASSERT_OK(Put("key2", rnd.RandomString(1024))); + ASSERT_OK(Flush()); } ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0)); @@ -5083,7 +5161,7 @@ TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { ASSERT_OK( Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } MoveFilesToLevel(2); @@ -5093,7 +5171,7 @@ TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { ASSERT_OK( Put("bar" + std::to_string(i * 10 + j), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } const std::vector& comp_stats = internal_stats_ptr->TEST_GetCompactionStats(); @@ -5102,7 +5180,7 @@ TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); const std::vector& comp_stats2 = internal_stats_ptr->TEST_GetCompactionStats(); @@ -5110,6 +5188,94 @@ TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) { ASSERT_EQ(num, 0); } +TEST_F(DBCompactionTest, ManualCompactionMax) { + uint64_t l1_avg_size = 0, l2_avg_size = 0; + auto generate_sst_func = [&]() { + Random rnd(301); + for (auto i = 0; i < 100; i++) { + for (auto j = 0; j < 10; j++) { + ASSERT_OK(Put(Key(i * 10 + j), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + for (auto i = 0; i < 10; i++) { + for (auto j = 0; j < 10; j++) { + ASSERT_OK(Put(Key(i * 100 + j * 10), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + + uint64_t total = 0; + for (const auto& file : level_to_files[1]) { + total += file.compensated_file_size; + } + l1_avg_size = total / level_to_files[1].size(); + + total = 0; + for (const auto& file : level_to_files[2]) { + total += file.compensated_file_size; + } + l2_avg_size = total / level_to_files[2].size(); + }; + + std::atomic_int num_compactions(0); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkCompaction", [&](void* /*arg*/) { ++num_compactions; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + + // with default setting (1.6G by default), it should cover all files in 1 + // compaction + DestroyAndReopen(opts); + generate_sst_func(); + num_compactions.store(0); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == 1); + + // split the compaction to 5 + int num_split = 5; + DestroyAndReopen(opts); + generate_sst_func(); + uint64_t total_size = (l1_avg_size * 10) + (l2_avg_size * 100); + 
opts.max_compaction_bytes = total_size / num_split; + Reopen(opts); + num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == num_split); + + // very small max_compaction_bytes, it should still move forward + opts.max_compaction_bytes = l1_avg_size / 2; + DestroyAndReopen(opts); + generate_sst_func(); + num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() > 10); + + // dynamically set the option + num_split = 2; + opts.max_compaction_bytes = 0; + DestroyAndReopen(opts); + generate_sst_func(); + total_size = (l1_avg_size * 10) + (l2_avg_size * 100); + Status s = db_->SetOptions( + {{"max_compaction_bytes", std::to_string(total_size / num_split)}}); + ASSERT_OK(s); + + num_compactions.store(0); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(num_compactions.load() == num_split); +} + TEST_F(DBCompactionTest, CompactionDuringShutdown) { Options opts = CurrentOptions(); opts.level0_file_num_compaction_trigger = 2; @@ -5127,14 +5293,15 @@ TEST_F(DBCompactionTest, CompactionDuringShutdown) { ASSERT_OK( Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024))); } - Flush(); + ASSERT_OK(Flush()); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", [&](void* /*arg*/) { dbfull()->shutting_down_.store(true); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.ok() || s.IsShutdownInProgress()); ASSERT_OK(dbfull()->error_handler_.GetBGError()); } @@ -5148,7 +5315,7 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) { // Generate an external SST file containing a single key, i.e. 99 std::string sst_files_dir = dbname_ + "/sst_files/"; - DestroyDir(env_, sst_files_dir); + ASSERT_OK(DestroyDir(env_, sst_files_dir)); ASSERT_OK(env_->CreateDir(sst_files_dir)); SstFileWriter sst_writer(EnvOptions(), options); const std::string sst_file_path = sst_files_dir + "test.sst"; @@ -5187,7 +5354,7 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) { // extra key to trigger flush. 
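// ---------------------------------------------------------------------------
// A minimal sketch (not from this patch) of the behavior the new
// ManualCompactionMax test above verifies: max_compaction_bytes also caps how
// much a manual CompactRange() pulls into one compaction job, so a full manual
// compaction splits into roughly total_size / max_compaction_bytes jobs. The
// knob can be set at open time or adjusted on a live DB; the function name is
// illustrative.
#include <cstdint>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status CapManualCompactionSize(rocksdb::DB* db, uint64_t bytes) {
  // Equivalent to setting options.max_compaction_bytes = bytes at open time.
  return db->SetOptions({{"max_compaction_bytes", std::to_string(bytes)}});
}
// ---------------------------------------------------------------------------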
ASSERT_OK(Put("", "")); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1); } // When we reach this point, there will be level0_stop_writes_trigger L0 @@ -5235,7 +5402,12 @@ TEST_F(DBCompactionTest, ConsistencyFailTest) { for (int k = 0; k < 2; ++k) { ASSERT_OK(Put("foo", "bar")); - Flush(); + Status s = Flush(); + if (k < 1) { + ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsCorruption()); + } } ASSERT_NOK(Put("foo", "bar")); @@ -5250,7 +5422,7 @@ TEST_F(DBCompactionTest, ConsistencyFailTest2) { options.level0_file_num_compaction_trigger = 2; BlockBasedTableOptions bbto; bbto.block_size = 400; // small block size - options.table_factory.reset(new BlockBasedTableFactory(bbto)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -5271,14 +5443,15 @@ TEST_F(DBCompactionTest, ConsistencyFailTest2) { ASSERT_OK(Put("foo1", value)); ASSERT_OK(Put("z", "")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("foo2", value)); ASSERT_OK(Put("z", "")); - Flush(); + Status s = Flush(); + ASSERT_TRUE(s.ok() || s.IsCorruption()); // This probably returns non-OK, but we rely on the next Put() // to determine the DB is frozen. - dbfull()->TEST_WaitForCompact(); + ASSERT_NOK(dbfull()->TEST_WaitForCompact()); ASSERT_NOK(Put("foo", "bar")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -5364,7 +5537,7 @@ TEST_P(DBCompactionTestWithParam, // Put one key, to make biggest log sequence number in this memtable is bigger // than sst which would be ingested in next step. ASSERT_OK(Put(Key(2), "b")); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); std::vector> level_to_files; dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), @@ -5452,7 +5625,7 @@ TEST_P(DBCompactionTestWithParam, // Wake up flush job sleeping_tasks.WakeUp(); sleeping_tasks.WaitUntilDone(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); uint64_t error_count = 0; @@ -5478,7 +5651,7 @@ TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) { for (int j = 1; j < UCHAR_MAX; j++) { auto key = std::string(kSstNum, '\0'); key[kSstNum - i] += static_cast(j); - Put(key, std::string(i % 1000, 'A')); + ASSERT_OK(Put(key, std::string(i % 1000, 'A'))); } ASSERT_OK(Flush()); } @@ -5488,7 +5661,7 @@ TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) { auto cro = CompactRangeOptions(); cro.bottommost_level_compaction = bottommost_level_compaction_; - db_->CompactRange(cro, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce || bottommost_level_compaction_ == BottommostLevelCompaction::kForceOptimized) { @@ -5527,12 +5700,12 @@ TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) { // Trigger compaction for (int i = 0; i < 32; i++) { for (int j = 0; j < 5000; j++) { - Put(std::to_string(j), std::string(1, 'A')); + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(has_compaction); 
has_compaction = false; @@ -5550,12 +5723,12 @@ TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) { // Trigger compaction for (int i = 0; i < 32; i++) { for (int j = 0; j < 5000; j++) { - Put(std::to_string(j), std::string(1, 'A')); + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(has_compaction); } @@ -5578,12 +5751,12 @@ TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) { // Trigger compaction for (int i = 0; i < 32; i++) { for (int j = 0; j < 5000; j++) { - Put(std::to_string(j), std::string(1, 'A')); + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(has_compaction); has_compaction = false; @@ -5601,12 +5774,12 @@ TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) { // Trigger compaction for (int i = 0; i < 32; i++) { for (int j = 0; j < 5000; j++) { - Put(std::to_string(j), std::string(1, 'A')); + ASSERT_OK(Put(std::to_string(j), std::string(1, 'A'))); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(has_compaction); } @@ -5654,7 +5827,7 @@ TEST_P(ChangeLevelConflictsWithAuto, TestConflict) { ASSERT_OK(Put("bar", "v3")); ASSERT_OK(Put("foo", "v3")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); TEST_SYNC_POINT("AutoCompactionFinished2"); }); @@ -5664,7 +5837,7 @@ TEST_P(ChangeLevelConflictsWithAuto, TestConflict) { cro.target_level = GetParam() ? 1 : 0; // This should return non-OK, but it's more important for the test to // make sure that the DB is not corrupted. - dbfull()->CompactRange(cro, nullptr, nullptr); + ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr)); } auto_comp.join(); // Refitting didn't happen. 
@@ -5845,6 +6018,734 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) { ASSERT_EQ("0,5", FilesPerLevel(0)); } +TEST_F(DBCompactionTest, CompactionWithBlob) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char second_key[] = "second_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_value[] = "second_value"; + constexpr char third_value[] = "third_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, first_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, second_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, third_value)); + ASSERT_OK(Put(second_key, third_value)); + ASSERT_OK(Flush()); + + options.enable_blob_files = true; + + Reopen(options); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(Get(first_key), third_value); + ASSERT_EQ(Get(second_key), third_value); + + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l1_files = storage_info->LevelFiles(1); + ASSERT_EQ(l1_files.size(), 1); + + const FileMetaData* const table_file = l1_files[0]; + ASSERT_NE(table_file, nullptr); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_EQ(blob_files.size(), 1); + + const auto& blob_file = blob_files.begin()->second; + ASSERT_NE(blob_file, nullptr); + + ASSERT_EQ(table_file->smallest.user_key(), first_key); + ASSERT_EQ(table_file->largest.user_key(), second_key); + ASSERT_EQ(table_file->oldest_blob_file_number, + blob_file->GetBlobFileNumber()); + + ASSERT_EQ(blob_file->GetTotalBlobCount(), 2); + + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[1].num_output_files, 1); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 1); +} + +class DBCompactionTestBlobError + : public DBCompactionTest, + public testing::WithParamInterface { + public: + DBCompactionTestBlobError() : sync_point_(GetParam()) {} + + std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobError, DBCompactionTestBlobError, + ::testing::ValuesIn(std::vector{ + "BlobFileBuilder::WriteBlobToFile:AddRecord", + "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); + +TEST_P(DBCompactionTestBlobError, CompactionError) { + Options options; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char second_key[] = "second_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_value[] = "second_value"; + constexpr char third_value[] = "third_value"; + + ASSERT_OK(Put(first_key, first_value)); + 
ASSERT_OK(Put(second_key, first_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, second_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(first_key, third_value)); + ASSERT_OK(Put(second_key, third_value)); + ASSERT_OK(Flush()); + + options.enable_blob_files = true; + + Reopen(options); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), begin, end).IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l1_files = storage_info->LevelFiles(1); + ASSERT_TRUE(l1_files.empty()); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_TRUE(blob_files.empty()); + + const InternalStats* const internal_stats = cfd->internal_stats(); + ASSERT_NE(internal_stats, nullptr); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") { + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].num_output_files, 0); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0); + } else { + // SST file writing succeeded; blob file writing failed (during Finish) + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_GT(compaction_stats[1].bytes_written, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].num_output_files, 1); + ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0); + } +} + +class DBCompactionTestBlobGC + : public DBCompactionTest, + public testing::WithParamInterface> { + public: + DBCompactionTestBlobGC() + : blob_gc_age_cutoff_(std::get<0>(GetParam())), + updated_enable_blob_files_(std::get<1>(GetParam())) {} + + double blob_gc_age_cutoff_; + bool updated_enable_blob_files_; +}; + +INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobGC, DBCompactionTestBlobGC, + ::testing::Combine(::testing::Values(0.0, 0.5, 1.0), + ::testing::Bool())); + +TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGC) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.blob_file_size = 32; // one blob per file + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + 
constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Flush()); + + const std::vector original_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(original_blob_files.size(), 4); + + const size_t cutoff_index = static_cast( + options.blob_garbage_collection_age_cutoff * original_blob_files.size()); + + // Note: turning off enable_blob_files before the compaction results in + // garbage collected values getting inlined. + size_t expected_number_of_files = original_blob_files.size(); + + if (!updated_enable_blob_files_) { + ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}})); + + expected_number_of_files -= cutoff_index; + } + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + + ASSERT_EQ(Get(first_key), first_value); + ASSERT_EQ(Get(second_key), second_value); + ASSERT_EQ(Get(third_key), third_value); + ASSERT_EQ(Get(fourth_key), fourth_value); + + const std::vector new_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(new_blob_files.size(), expected_number_of_files); + + // Original blob files below the cutoff should be gone, original blob files at + // or above the cutoff should be still there + for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) { + ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]); + } + + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + assert(versions); + assert(versions->GetColumnFamilySet()); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + const InternalStats* const internal_stats = cfd->internal_stats(); + assert(internal_stats); + + const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); + ASSERT_GE(compaction_stats.size(), 2); + + if (blob_gc_age_cutoff_ > 0.0) { + ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); + + if (updated_enable_blob_files_) { + // GC relocated some blobs to new blob files + ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_read_blob, + compaction_stats[1].bytes_written_blob); + } else { + // GC moved some blobs back to the LSM, no new blob files + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + } + } else { + ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); + ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); + } +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_CorruptIndex) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + constexpr char corrupt_blob_index[] = "foobar"; + + WriteBatch batch; + ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, + corrupt_blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + 
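// ---------------------------------------------------------------------------
// A minimal sketch (not from this patch) of the integrated BlobDB options the
// DBCompactionTestBlobGC cases above combine: values go to blob files when
// enable_blob_files is set, and compactions relocate ("garbage collect") blobs
// stored in the oldest blob_garbage_collection_age_cutoff fraction of blob
// files. The function name and the 0.5 cutoff are illustrative.
#include "rocksdb/options.h"

rocksdb::Options MakeBlobGcOptions() {
  rocksdb::Options options;
  options.enable_blob_files = true;
  options.blob_file_size = 32;  // tiny files here, i.e. one blob per file
  options.enable_blob_garbage_collection = true;
  // 0.5: compaction relocates blobs from the oldest half of the blob files;
  // if enable_blob_files is later switched off, those values are re-inlined
  // into the SSTs instead, as the test's expected file counts reflect.
  options.blob_garbage_collection_age_cutoff = 0.5;
  return options;
}
// ---------------------------------------------------------------------------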
constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_InlinedTTLIndex) { + constexpr uint64_t min_blob_size = 10; + + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + constexpr char blob[] = "short"; + static_assert(sizeof(short) - 1 < min_blob_size, + "Blob too long to be inlined"); + + // Fake an inlined TTL blob index. + std::string blob_index; + + constexpr uint64_t expiration = 1234567890; + + BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob); + + WriteBatch batch; + ASSERT_OK( + WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithBlobGCError_IndexWithInvalidFileNumber) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 1.0; + + Reopen(options); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + ASSERT_OK(Put(first_key, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + ASSERT_OK(Put(second_key, second_value)); + + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + ASSERT_OK(Put(third_key, third_value)); + + constexpr char fourth_key[] = "fourth_key"; + + // Fake a blob index referencing a non-existent blob file. 
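+  // For reference (sketch based on the arguments used below):
+  // BlobIndex::EncodeBlob packs (blob file number, offset, size, compression
+  // type) into the string that becomes the entry's value. Blob file number
+  // 1000 was never written, so resolving the reference during blob garbage
+  // collection is expected to fail with Status::Corruption.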
+ std::string blob_index; + + constexpr uint64_t blob_file_number = 1000; + constexpr uint64_t offset = 1234; + constexpr uint64_t size = 5678; + + BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size, + kNoCompression); + + WriteBatch batch; + ASSERT_OK( + WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index)); + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + ASSERT_OK(Flush()); + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_TRUE( + db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption()); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kTableFile); + Status s; + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // The hash does not match, compaction write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. 
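+  // Background on the handoff mechanism (summary; the exact plumbing is an
+  // assumption): with checksum_handoff_file_types containing kTableFile,
+  // RocksDB computes a checksum for each table-file write and passes it to
+  // the file system together with the data, and FaultInjectionTestFS checks
+  // the buffer against it using the type set via SetChecksumHandoffFuncType().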
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + Status s; + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // options is not set, the checksum handoff will not be triggered + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // options is not set, the checksum handoff will not be triggered + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + Status s; + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + Reopen(options); + + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); + Reopen(options); + + // The hash does not match, compaction write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. 
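+  // Note on severities (inferred from the assertions in these tests): a
+  // handoff failure on a table file is surfaced as kUnrecoverableError in the
+  // tests above, while the same kind of failure on the MANIFEST
+  // (kDescriptorFile) is asserted to map to kFatalError below.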
+ ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 3; + options.env = fault_fs_env.get(); + options.create_if_missing = true; + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + Status s; + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s, Status::OK()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put(Key(0), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "value3")); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + #endif // !defined(ROCKSDB_LITE) } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_dynamic_level_test.cc b/db/db_dynamic_level_test.cc index 6ecf727c91e..6e2f6a283e7 100644 --- a/db/db_dynamic_level_test.cc +++ b/db/db_dynamic_level_test.cc @@ -13,6 +13,7 @@ #if !defined(ROCKSDB_LITE) #include "db/db_test_util.h" +#include "env/mock_env.h" #include "port/port.h" #include "port/stack_trace.h" #include "util/random.h" @@ -102,7 +103,8 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) { } // Test compact range works - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // All data should be in the last level. 
ColumnFamilyMetaData cf_meta; db_->GetColumnFamilyMetaData(&cf_meta); @@ -141,6 +143,7 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { options.max_background_compactions = 2; options.num_levels = 5; options.max_compaction_bytes = 0; // Force not expanding in compactions + options.db_host_id = ""; // Setting this messes up the file size calculation BlockBasedTableOptions table_options; table_options.block_size = 1024; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -165,8 +168,8 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(4U, int_prop); @@ -183,8 +186,8 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(3U, int_prop); ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop)); @@ -204,8 +207,8 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(3U, int_prop); @@ -233,8 +236,8 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { })); TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0"); - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(2U, int_prop); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -263,7 +266,7 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { } TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:2"); - Flush(); + ASSERT_OK(Flush()); thread.join(); @@ -301,7 +304,7 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) { DestroyAndReopen(options); // Compact against empty DB - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); uint64_t int_prop; std::string str_prop; @@ -315,13 +318,13 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) { ASSERT_OK( Put(Key(static_cast(rnd.Uniform(kMaxKey))), rnd.RandomString(80))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0) == 0) { // Make sure level 0 is not empty ASSERT_OK( Put(Key(static_cast(rnd.Uniform(kMaxKey))), rnd.RandomString(80))); - Flush(); + ASSERT_OK(Flush()); } ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); @@ -342,7 +345,7 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) { }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(output_levels.size(), 2); ASSERT_TRUE(output_levels.find(3) != output_levels.end()); ASSERT_TRUE(output_levels.find(4) != output_levels.end()); @@ -388,8 +391,8 @@ 
TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBaseInc) { PutFixed32(&value, static_cast(i)); ASSERT_OK(Put(Key(i), value)); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(non_trivial, 0); @@ -448,7 +451,7 @@ TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) { ASSERT_OK(Delete(Key(i / 10))); } verify_func(total_keys, false); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); options.level_compaction_dynamic_level_bytes = true; options.disable_auto_compactions = true; @@ -463,7 +466,7 @@ TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) { CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = options.num_levels - 1; - dbfull()->CompactRange(compact_options, nullptr, nullptr); + ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); compaction_finished.store(true); }); do { @@ -483,7 +486,7 @@ TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) { } verify_func(total_keys2, false); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); verify_func(total_keys2, false); // Base level is not level 1 diff --git a/db/db_encryption_test.cc b/db/db_encryption_test.cc index dddc8330713..c2ec3ec8a52 100644 --- a/db/db_encryption_test.cc +++ b/db/db_encryption_test.cc @@ -18,6 +18,13 @@ class DBEncryptionTest : public DBTestBase { public: DBEncryptionTest() : DBTestBase("/db_encryption_test", /*env_do_fsync=*/true) {} + Env* GetTargetEnv() { + if (encrypted_env_ != nullptr) { + return (static_cast(encrypted_env_))->target(); + } else { + return env_; + } + } }; #ifndef ROCKSDB_LITE @@ -34,20 +41,20 @@ TEST_F(DBEncryptionTest, CheckEncrypted) { auto status = env_->GetChildren(dbname_, &fileNames); ASSERT_OK(status); - auto defaultEnv = Env::Default(); + Env* target = GetTargetEnv(); int hits = 0; for (auto it = fileNames.begin() ; it != fileNames.end(); ++it) { - if ((*it == "..") || (*it == ".")) { + if (*it == "LOCK") { continue; } auto filePath = dbname_ + "/" + *it; std::unique_ptr seqFile; auto envOptions = EnvOptions(CurrentOptions()); - status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions); + status = target->NewSequentialFile(filePath, &seqFile, envOptions); ASSERT_OK(status); uint64_t fileSize; - status = defaultEnv->GetFileSize(filePath, &fileSize); + status = target->GetFileSize(filePath, &fileSize); ASSERT_OK(status); std::string scratch; @@ -85,7 +92,7 @@ TEST_F(DBEncryptionTest, CheckEncrypted) { } TEST_F(DBEncryptionTest, ReadEmptyFile) { - auto defaultEnv = Env::Default(); + auto defaultEnv = GetTargetEnv(); // create empty file for reading it back in later auto envOptions = EnvOptions(CurrentOptions()); diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 35b8f648e04..fce28c02cc9 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -98,7 +98,14 @@ Status DBImpl::GetLiveFiles(std::vector& ret, ret.emplace_back(CurrentFileName("")); ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number())); - ret.emplace_back(OptionsFileName("", versions_->options_file_number())); + // The OPTIONS file number is zero in read-write mode when OPTIONS file + // writing failed and the DB was configured with + // `fail_if_options_file_error == false`. 
In read-only mode the OPTIONS file + // number is zero when no OPTIONS file exist at all. In those cases we do not + // record any OPTIONS file in the live file list. + if (versions_->options_file_number() != 0) { + ret.emplace_back(OptionsFileName("", versions_->options_file_number())); + } // find length of manifest file while holding the mutex lock *manifest_file_size = versions_->manifest_file_size(); diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index 529acbc51dc..b5d3026d86e 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -11,13 +11,16 @@ #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" +#include "env/mock_env.h" #include "file/filename.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/utilities/transaction_db.h" #include "test_util/sync_point.h" #include "util/cast_util.h" #include "util/mutexlock.h" #include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -63,7 +66,7 @@ TEST_F(DBFlushTest, FlushWhileWritingManifest) { ASSERT_OK(Put("bar", "v")); ASSERT_OK(dbfull()->Flush(no_wait)); // If the issue is hit we will wait here forever. - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); #ifndef ROCKSDB_LITE ASSERT_EQ(2, TotalTableFiles()); #endif // ROCKSDB_LITE @@ -79,41 +82,26 @@ TEST_F(DBFlushTest, SyncFail) { options.env = fault_injection_env.get(); SyncPoint::GetInstance()->LoadDependency( - {{"DBFlushTest::SyncFail:GetVersionRefCount:1", - "DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"}, - {"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", - "DBFlushTest::SyncFail:GetVersionRefCount:2"}, - {"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"}, + {{"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"}, {"DBImpl::SyncClosedLogs:Failed", "DBFlushTest::SyncFail:2"}}); SyncPoint::GetInstance()->EnableProcessing(); CreateAndReopenWithCF({"pikachu"}, options); - Put("key", "value"); - auto* cfd = - static_cast_with_check(db_->DefaultColumnFamily()) - ->cfd(); + ASSERT_OK(Put("key", "value")); FlushOptions flush_options; flush_options.wait = false; ASSERT_OK(dbfull()->Flush(flush_options)); // Flush installs a new super-version. Get the ref count after that. - auto current_before = cfd->current(); - int refs_before = cfd->current()->TEST_refs(); - TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:1"); - TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:2"); - int refs_after_picking_memtables = cfd->current()->TEST_refs(); - ASSERT_EQ(refs_before + 1, refs_after_picking_memtables); fault_injection_env->SetFilesystemActive(false); TEST_SYNC_POINT("DBFlushTest::SyncFail:1"); TEST_SYNC_POINT("DBFlushTest::SyncFail:2"); fault_injection_env->SetFilesystemActive(true); // Now the background job will do the flush; wait for it. - dbfull()->TEST_WaitForFlushMemTable(); + // Returns the IO error happend during flush. + ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable()); #ifndef ROCKSDB_LITE ASSERT_EQ("", FilesPerLevel()); // flush failed. #endif // ROCKSDB_LITE - // Backgroun flush job should release ref count to current version. 
- ASSERT_EQ(current_before, cfd->current()); - ASSERT_EQ(refs_before, cfd->current()->TEST_refs()); Destroy(options); } @@ -126,7 +114,7 @@ TEST_F(DBFlushTest, SyncSkip) { SyncPoint::GetInstance()->EnableProcessing(); Reopen(options); - Put("key", "value"); + ASSERT_OK(Put("key", "value")); FlushOptions flush_options; flush_options.wait = false; @@ -136,7 +124,7 @@ TEST_F(DBFlushTest, SyncSkip) { TEST_SYNC_POINT("DBFlushTest::SyncSkip:2"); // Now the background job will do the flush; wait for it. - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); Destroy(options); } @@ -171,13 +159,73 @@ TEST_F(DBFlushTest, FlushInLowPriThreadPool) { ASSERT_OK(Put("key", "val")); for (int i = 0; i < 4; ++i) { ASSERT_OK(Put("key", "val")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(4, num_flushes); ASSERT_EQ(1, num_compactions); } +// Test when flush job is submitted to low priority thread pool and when DB is +// closed in the meanwhile, CloseHelper doesn't hang. +TEST_F(DBFlushTest, CloseDBWhenFlushInLowPri) { + Options options = CurrentOptions(); + options.max_background_flushes = 1; + options.max_total_wal_size = 8192; + + DestroyAndReopen(options); + CreateColumnFamilies({"cf1", "cf2"}, options); + + env_->SetBackgroundThreads(0, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + int num_flushes = 0; + + SyncPoint::GetInstance()->SetCallBack("DBImpl::BGWorkFlush", + [&](void* /*arg*/) { ++num_flushes; }); + + int num_low_flush_unscheduled = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::UnscheduleLowFlushCallback", [&](void* /*arg*/) { + num_low_flush_unscheduled++; + // There should be one flush job in low pool that needs to be + // unscheduled + ASSERT_EQ(num_low_flush_unscheduled, 1); + }); + + int num_high_flush_unscheduled = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::UnscheduleHighFlushCallback", [&](void* /*arg*/) { + num_high_flush_unscheduled++; + // There should be no flush job in high pool + ASSERT_EQ(num_high_flush_unscheduled, 0); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(0, "key1", DummyString(8192))); + // Block thread so that flush cannot be run and can be removed from the queue + // when called Unschedule. + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + sleeping_task_low.WaitUntilSleeping(); + + // Trigger flush and flush job will be scheduled to LOW priority thread. + ASSERT_OK(Put(0, "key2", DummyString(8192))); + + // Close DB and flush job in low priority queue will be removed without + // running. + Close(); + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + ASSERT_EQ(0, num_flushes); + + TryReopenWithColumnFamilies({"default", "cf1", "cf2"}, options); + ASSERT_OK(Put(0, "key3", DummyString(8192))); + ASSERT_OK(Flush(0)); + ASSERT_EQ(1, num_flushes); +} + TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) { Options options = CurrentOptions(); options.write_buffer_size = 100; @@ -237,6 +285,379 @@ TEST_F(DBFlushTest, ScheduleOnlyOneBgThread) { SyncPoint::GetInstance()->ClearAllCallBacks(); } +// The following 3 tests are designed for testing garbage statistics at flush +// time. +// +// ======= General Information ======= (from GitHub Wiki). 
+// There are three scenarios where memtable flush can be triggered: +// +// 1 - Memtable size exceeds ColumnFamilyOptions::write_buffer_size +// after a write. +// 2 - Total memtable size across all column families exceeds +// DBOptions::db_write_buffer_size, +// or DBOptions::write_buffer_manager signals a flush. In this scenario +// the largest memtable will be flushed. +// 3 - Total WAL file size exceeds DBOptions::max_total_wal_size. +// In this scenario the memtable with the oldest data will be flushed, +// in order to allow the WAL file with data from this memtable to be +// purged. +// +// As a result, a memtable can be flushed before it is full. This is one +// reason the generated SST file can be smaller than the corresponding +// memtable. Compression is another factor to make SST file smaller than +// corresponding memtable, since data in memtable is uncompressed. + +TEST_F(DBFlushTest, StatisticsGarbageBasic) { + Options options = CurrentOptions(); + + // The following options are used to enforce several values that + // may already exist as default values to make this test resilient + // to default value updates in the future. + options.statistics = CreateDBStatistics(); + + // Record all statistics. + options.statistics->set_stats_level(StatsLevel::kAll); + + // create the DB if it's not already present + options.create_if_missing = true; + + // Useful for now as we are trying to compare uncompressed data savings on + // flush(). + options.compression = kNoCompression; + + // Prevent memtable in place updates. Should already be disabled + // (from Wiki: + // In place updates can be enabled by toggling on the bool + // inplace_update_support flag. However, this flag is by default set to + // false + // because this thread-safe in-place update support is not compatible + // with concurrent memtable writes. Note that the bool + // allow_concurrent_memtable_write is set to true by default ) + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + + // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). + options.write_buffer_size = 64 << 20; + + ASSERT_OK(TryReopen(options)); + + // Put multiple times the same key-values. + // The encoded length of a db entry in the memtable is + // defined in db/memtable.cc (MemTable::Add) as the variable: + // encoded_len= VarintLength(internal_key_size) --> = + // log_256(internal_key). + // Min # of bytes + // necessary to + // store + // internal_key_size. + // + internal_key_size --> = actual key string, + // (size key_size: w/o term null char) + // + 8 bytes for + // fixed uint64 "seq + // number + // + + // insertion type" + // + VarintLength(val_size) --> = min # of bytes to + // store val_size + // + val_size --> = actual value + // string + // For example, in our situation, "key1" : size 4, "value1" : size 6 + // (the terminating null characters are not copied over to the memtable). + // And therefore encoded_len = 1 + (4+8) + 1 + 6 = 20 bytes per entry. + // However in terms of raw data contained in the memtable, and written + // over to the SSTable, we only count internal_key_size and val_size, + // because this is the only raw chunk of bytes that contains everything + // necessary to reconstruct a user entry: sequence number, insertion type, + // key, and value. 
+ + // To test the relevance of our Memtable garbage statistics, + // namely MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + // we insert K-V pairs with 3 distinct keys (of length 4), + // and random values of arbitrary length RAND_VALUES_LENGTH, + // and we repeat this step NUM_REPEAT times total. + // At the end, we insert 3 final K-V pairs with the same 3 keys + // and known values (these will be the final values, of length 6). + // I chose NUM_REPEAT=2,000 such that no automatic flush is + // triggered (the number of bytes in the memtable is therefore + // well below any meaningful heuristic for a memtable of size 64MB). + // As a result, since each K-V pair is inserted as a payload + // of N meaningful bytes (sequence number, insertion type, + // key, and value = 8 + 4 + RAND_VALUE_LENGTH), + // MEMTABLE_GARBAGE_BYTES_AT_FLUSH should be equal to 2,000 * N bytes + // and MEMTABLE_PAYLAOD_BYTES_AT_FLUSH = MEMTABLE_GARBAGE_BYTES_AT_FLUSH + + // (3*(8 + 4 + 6)) bytes. For RAND_VALUE_LENGTH = 172 (arbitrary value), we + // expect: + // N = 8 + 4 + 172 = 184 bytes + // MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 2,000 * 184 = 368,000 bytes. + // MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 368,000 + 3*18 = 368,054 bytes. + + const size_t NUM_REPEAT = 2000; + const size_t RAND_VALUES_LENGTH = 172; + const std::string KEY1 = "key1"; + const std::string KEY2 = "key2"; + const std::string KEY3 = "key3"; + const std::string VALUE1 = "value1"; + const std::string VALUE2 = "value2"; + const std::string VALUE3 = "value3"; + uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0; + uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0; + + Random rnd(301); + // Insertion of of K-V pairs, multiple times. + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. + std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY1.size() + p_v1.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY2.size() + p_v2.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY3.size() + p_v3.size() + sizeof(uint64_t); + } + + // The memtable data bytes includes the "garbage" + // bytes along with the useful payload. + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + + ASSERT_OK(Put(KEY1, VALUE1)); + ASSERT_OK(Put(KEY2, VALUE2)); + ASSERT_OK(Put(KEY3, VALUE3)); + + // Add useful payload to the memtable data bytes: + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + KEY1.size() + VALUE1.size() + KEY2.size() + VALUE2.size() + KEY3.size() + + VALUE3.size() + 3 * sizeof(uint64_t); + + // We assert that the last K-V pairs have been successfully inserted, + // and that the valid values are VALUE1, VALUE2, VALUE3. + PinnableSlice value; + ASSERT_OK(Get(KEY1, &value)); + ASSERT_EQ(value.ToString(), VALUE1); + ASSERT_OK(Get(KEY2, &value)); + ASSERT_EQ(value.ToString(), VALUE2); + ASSERT_OK(Get(KEY3, &value)); + ASSERT_EQ(value.ToString(), VALUE3); + + // Force flush to SST. Increments the statistics counter. + ASSERT_OK(Flush()); + + // Collect statistics. 
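+  // For reference, the two tickers verified at the end of this test can be
+  // read from any Statistics object; in application code this would look
+  // roughly like:
+  //   uint64_t garbage =
+  //       options.statistics->getTickerCount(MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+  // The test below uses the TestGetTickerCount() helper for the same purpose.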
+  uint64_t mem_data_bytes =
+      TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  uint64_t mem_garbage_bytes =
+      TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+  EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+  Close();
+}
+
+TEST_F(DBFlushTest, StatisticsGarbageInsertAndDeletes) {
+  Options options = CurrentOptions();
+  options.statistics = CreateDBStatistics();
+  options.statistics->set_stats_level(StatsLevel::kAll);
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.inplace_update_support = false;
+  options.allow_concurrent_memtable_write = true;
+  options.write_buffer_size = 67108864;
+
+  ASSERT_OK(TryReopen(options));
+
+  const size_t NUM_REPEAT = 2000;
+  const size_t RAND_VALUES_LENGTH = 37;
+  const std::string KEY1 = "key1";
+  const std::string KEY2 = "key2";
+  const std::string KEY3 = "key3";
+  const std::string KEY4 = "key4";
+  const std::string KEY5 = "key5";
+  const std::string KEY6 = "key6";
+
+  uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+  uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+  WriteBatch batch;
+
+  Random rnd(301);
+  // Insertion of K-V pairs, multiple times.
+  for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+    std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+    std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+    ASSERT_OK(Put(KEY1, p_v1));
+    ASSERT_OK(Put(KEY2, p_v2));
+    ASSERT_OK(Put(KEY3, p_v3));
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY1.size() + p_v1.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY2.size() + p_v2.size() + sizeof(uint64_t);
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY3.size() + p_v3.size() + sizeof(uint64_t);
+    ASSERT_OK(Delete(KEY1));
+    ASSERT_OK(Delete(KEY2));
+    ASSERT_OK(Delete(KEY3));
+    EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+        KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
+  }
+
+  // The memtable data bytes include the "garbage"
+  // bytes along with the useful payload.
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+      EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+  // Note: one set of deletes for KEY1, KEY2, KEY3 is written to the
+  // SSTable to propagate the delete operations to K-V pairs
+  // that could have been inserted into the database during past Flush
+  // operations.
+  EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
+      KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
+
+  // Additional useful payload.
+  ASSERT_OK(Delete(KEY4));
+  ASSERT_OK(Delete(KEY5));
+  ASSERT_OK(Delete(KEY6));
+
+  // Add useful payload to the memtable data bytes:
+  EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+      KEY4.size() + KEY5.size() + KEY6.size() + 3 * sizeof(uint64_t);
+
+  // We assert that the K-V pairs have been successfully deleted.
+  PinnableSlice value;
+  ASSERT_NOK(Get(KEY1, &value));
+  ASSERT_NOK(Get(KEY2, &value));
+  ASSERT_NOK(Get(KEY3, &value));
+
+  // Force flush to SST. Increments the statistics counter.
+  ASSERT_OK(Flush());
+
+  // Collect statistics.
+ uint64_t mem_data_bytes = + TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + uint64_t mem_garbage_bytes = + TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + Close(); +} + +TEST_F(DBFlushTest, StatisticsGarbageRangeDeletes) { + Options options = CurrentOptions(); + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + options.create_if_missing = true; + options.compression = kNoCompression; + options.inplace_update_support = false; + options.allow_concurrent_memtable_write = true; + options.write_buffer_size = 67108864; + + ASSERT_OK(TryReopen(options)); + + const size_t NUM_REPEAT = 1000; + const size_t RAND_VALUES_LENGTH = 42; + const std::string KEY1 = "key1"; + const std::string KEY2 = "key2"; + const std::string KEY3 = "key3"; + const std::string KEY4 = "key4"; + const std::string KEY5 = "key5"; + const std::string KEY6 = "key6"; + const std::string VALUE3 = "value3"; + + uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0; + uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0; + + Random rnd(301); + // Insertion of of K-V pairs, multiple times. + // Also insert DeleteRange + for (size_t i = 0; i < NUM_REPEAT; i++) { + // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes. + std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH); + std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH); + ASSERT_OK(Put(KEY1, p_v1)); + ASSERT_OK(Put(KEY2, p_v2)); + ASSERT_OK(Put(KEY3, p_v3)); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY1.size() + p_v1.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY2.size() + p_v2.size() + sizeof(uint64_t); + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + KEY3.size() + p_v3.size() + sizeof(uint64_t); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY1, + KEY2)); + // Note: DeleteRange have an exclusive upper bound, e.g. here: [KEY2,KEY3) + // is deleted. + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY2, + KEY3)); + // Delete ranges are stored as a regular K-V pair, with key=STARTKEY, + // value=ENDKEY. + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH += + (KEY1.size() + KEY2.size() + sizeof(uint64_t)) + + (KEY2.size() + KEY3.size() + sizeof(uint64_t)); + } + + // The memtable data bytes includes the "garbage" + // bytes along with the useful payload. + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH; + + // Note : one set of deleteRange for (KEY1, KEY2) and (KEY2, KEY3) is written + // to SSTable to propagate the deleteRange operations to K-V pairs that could + // have been inserted into the database during past Flush opeartions. + EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -= + (KEY1.size() + KEY2.size() + sizeof(uint64_t)) + + (KEY2.size() + KEY3.size() + sizeof(uint64_t)); + + // Overwrite KEY3 with known value (VALUE3) + // Note that during the whole time KEY3 has never been deleted + // by the RangeDeletes. + ASSERT_OK(Put(KEY3, VALUE3)); + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + KEY3.size() + VALUE3.size() + sizeof(uint64_t); + + // Additional useful paylaod. 
+ ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY4, KEY5)); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY5, KEY6)); + + // Add useful payload to the memtable data bytes: + EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH += + (KEY4.size() + KEY5.size() + sizeof(uint64_t)) + + (KEY5.size() + KEY6.size() + sizeof(uint64_t)); + + // We assert that the K-V pairs have been successfully deleted. + PinnableSlice value; + ASSERT_NOK(Get(KEY1, &value)); + ASSERT_NOK(Get(KEY2, &value)); + // And that KEY3's value is correct. + ASSERT_OK(Get(KEY3, &value)); + ASSERT_EQ(value, VALUE3); + + // Force flush to SST. Increments the statistics counter. + ASSERT_OK(Flush()); + + // Collect statistics. + uint64_t mem_data_bytes = + TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + uint64_t mem_garbage_bytes = + TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH); + EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH); + + Close(); +} + TEST_P(DBFlushDirectIOTest, DirectIO) { Options options; options.create_if_missing = true; @@ -306,7 +727,8 @@ TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) { // mode. fault_injection_env->SetFilesystemActive(false); ASSERT_OK(db_->ContinueBackgroundWork()); - dbfull()->TEST_WaitForFlushMemTable(); + // We ingested the error to env, so the returned status is not OK. + ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable()); #ifndef ROCKSDB_LITE uint64_t num_bg_errors; ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBackgroundErrors, @@ -451,6 +873,7 @@ TEST_F(DBFlushTest, FlushWithBlob) { options.enable_blob_files = true; options.min_blob_size = min_blob_size; options.disable_auto_compactions = true; + options.env = env_; Reopen(options); @@ -468,9 +891,7 @@ TEST_F(DBFlushTest, FlushWithBlob) { ASSERT_OK(Flush()); ASSERT_EQ(Get("key1"), short_value); - - // TODO: enable once Get support is implemented for blobs - // ASSERT_EQ(Get("key2"), long_value); + ASSERT_EQ(Get("key2"), long_value); VersionSet* const versions = dbfull()->TEST_GetVersionSet(); assert(versions); @@ -509,26 +930,225 @@ TEST_F(DBFlushTest, FlushWithBlob) { const InternalStats* const internal_stats = cfd->internal_stats(); assert(internal_stats); - const uint64_t expected_bytes = - table_file->fd.GetFileSize() + blob_file->GetTotalBlobBytes(); - const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); ASSERT_FALSE(compaction_stats.empty()); - ASSERT_EQ(compaction_stats[0].bytes_written, expected_bytes); - ASSERT_EQ(compaction_stats[0].num_output_files, 2); + ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1); const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue(); - ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], expected_bytes); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); #endif // ROCKSDB_LITE } +TEST_F(DBFlushTest, FlushWithChecksumHandoff1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr 
fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kTableFile); + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + + // The hash does not match, write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is an + // unrecoverable error. + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->IngestDataCorruptionBeforeWrite(); + }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), + ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoff2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + Reopen(options); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(Flush()); + + // options is not set, the checksum handoff will not be triggered + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); + Reopen(options); + + // The file system does not support checksum handoff. The check + // will be ignored. 
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(Flush()); + + // options is not set, the checksum handoff will not be triggered + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_fs->IngestDataCorruptionBeforeWrite(); + }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Flush()); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest1) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + Reopen(options); + + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(Flush()); + + // The hash does not match, write fails + // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. + ASSERT_OK(Put("key3", "value3")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void*) { + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + }); + ASSERT_OK(Put("key3", "value3")); + ASSERT_OK(Put("key4", "value4")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + Destroy(options); +} + +TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest2) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_fs_env.get(); + options.checksum_handoff_file_types.Add(FileType::kDescriptorFile); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + Reopen(options); + // The file system does not support checksum handoff. The check + // will be ignored. + ASSERT_OK(Put("key5", "value5")); + ASSERT_OK(Put("key6", "value6")); + ASSERT_OK(Flush()); + + // Each write will be similated as corrupted. + // Since the file system returns IOStatus::Corruption, it is mapped to + // kFatalError error. 
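+  // (Assumption about the helper:) IngestDataCorruptionBeforeWrite() makes
+  // FaultInjectionTestFS tamper with the buffer before persisting it, so the
+  // checksum handed off with the MANIFEST write no longer matches and the
+  // file system reports IOStatus::Corruption, which is expected to surface
+  // as kFatalError below.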
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); }); + ASSERT_OK(Put("key7", "value7")); + ASSERT_OK(Put("key8", "value8")); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); + SyncPoint::GetInstance()->DisableProcessing(); + + Destroy(options); +} + class DBFlushTestBlobError : public DBFlushTest, public testing::WithParamInterface { public: - DBFlushTestBlobError() : fault_injection_env_(env_) {} - ~DBFlushTestBlobError() { Close(); } + DBFlushTestBlobError() : sync_point_(GetParam()) {} - FaultInjectionTestEnv fault_injection_env_; + std::string sync_point_; }; INSTANTIATE_TEST_CASE_P(DBFlushTestBlobError, DBFlushTestBlobError, @@ -540,19 +1160,18 @@ TEST_P(DBFlushTestBlobError, FlushError) { Options options; options.enable_blob_files = true; options.disable_auto_compactions = true; - options.env = &fault_injection_env_; + options.env = env_; Reopen(options); ASSERT_OK(Put("key", "blob")); - SyncPoint::GetInstance()->SetCallBack(GetParam(), [this](void* /* arg */) { - fault_injection_env_.SetFilesystemActive(false, Status::IOError()); + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); }); - SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", [this](void* /* arg */) { - fault_injection_env_.SetFilesystemActive(true); - }); SyncPoint::GetInstance()->EnableProcessing(); ASSERT_NOK(Flush()); @@ -599,14 +1218,117 @@ TEST_P(DBFlushTestBlobError, FlushError) { const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); ASSERT_FALSE(compaction_stats.empty()); - ASSERT_EQ(compaction_stats[0].bytes_written, 0); - ASSERT_EQ(compaction_stats[0].num_output_files, 0); + + if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") { + ASSERT_EQ(compaction_stats[0].bytes_written, 0); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[0].num_output_files, 0); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0); + } else { + // SST file writing succeeded; blob file writing failed (during Finish) + ASSERT_GT(compaction_stats[0].bytes_written, 0); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0); + } const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue(); - ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], 0); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); #endif // ROCKSDB_LITE } +#ifndef ROCKSDB_LITE +TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.allow_2pc = true; + options.atomic_flush = GetParam(); + // 64MB so that memtable flush won't be trigger by the small writes. + options.write_buffer_size = (static_cast(64) << 20); + + // Destroy the DB to recreate as a TransactionDB. + Close(); + Destroy(options, true); + + // Create a TransactionDB. 
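+  // Why a TransactionDB: allow_2pc only takes effect for two-phase
+  // transactions, and the Prepare()/Commit() calls below require the
+  // TransactionDB API (here with the WRITE_COMMITTED policy), so the
+  // fixture's plain DB is destroyed and recreated as a TransactionDB.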
+  TransactionDB* txn_db = nullptr;
+  TransactionDBOptions txn_db_opts;
+  txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
+  ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
+  ASSERT_NE(txn_db, nullptr);
+  db_ = txn_db;
+
+  // Create two more column families besides the default CF.
+  std::vector<std::string> cfs = {"puppy", "kitty"};
+  CreateColumnFamilies(cfs, options);
+  ASSERT_EQ(handles_.size(), 2);
+  ASSERT_EQ(handles_[0]->GetName(), cfs[0]);
+  ASSERT_EQ(handles_[1]->GetName(), cfs[1]);
+  const size_t kNumCfToFlush = options.atomic_flush ? 2 : 1;
+
+  WriteOptions wopts;
+  TransactionOptions txn_opts;
+  // txn1 only prepares, but does not commit.
+  // The WAL containing the prepared but uncommitted data must be kept.
+  Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  // txn2 both prepares and commits.
+  Transaction* txn2 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+  ASSERT_NE(txn1, nullptr);
+  ASSERT_NE(txn2, nullptr);
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    ASSERT_OK(txn1->Put(handles_[i], "k1", "v1"));
+    ASSERT_OK(txn2->Put(handles_[i], "k2", "v2"));
+  }
+  // A txn must be named before prepare.
+  ASSERT_OK(txn1->SetName("txn1"));
+  ASSERT_OK(txn2->SetName("txn2"));
+  // Prepare writes to WAL, but not to memtable. (WriteCommitted)
+  ASSERT_OK(txn1->Prepare());
+  ASSERT_OK(txn2->Prepare());
+  // Commit writes to memtable.
+  ASSERT_OK(txn2->Commit());
+  delete txn1;
+  delete txn2;
+
+  // There is still data in the memtable that has not been flushed.
+  // But since the data is small enough to reside in the active memtable,
+  // there are no immutable memtables.
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+  }
+
+  // Atomically flush the memtables; the min log number with prepared data
+  // should be written to the MANIFEST.
+  std::vector<ColumnFamilyHandle*> cfs_to_flush(kNumCfToFlush);
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    cfs_to_flush[i] = handles_[i];
+  }
+  ASSERT_OK(txn_db->Flush(FlushOptions(), cfs_to_flush));
+
+  // There is no remaining data in the memtables after the flush.
+  for (size_t i = 0; i < kNumCfToFlush; i++) {
+    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+    ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+    ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+    ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush);
+  }
+
+  // The recovered min log number with prepared data should be non-zero.
+  // In 2PC mode, MinLogNumberToKeep returns the
+  // VersionSet::min_log_number_to_keep_2pc recovered from the MANIFEST; if it
+  // is 0, atomic flush did not write the min_log_number_to_keep to the
+  // MANIFEST.
+ cfs.push_back(kDefaultColumnFamilyName); + ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + DBImpl* db_impl = reinterpret_cast(db_); + ASSERT_TRUE(db_impl->allow_2pc()); + ASSERT_NE(db_impl->MinLogNumberToKeep(), 0); +} +#endif // ROCKSDB_LITE + TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -621,18 +1343,84 @@ TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { for (size_t i = 0; i != num_cfs; ++i) { ASSERT_OK(Put(static_cast(i) /*cf*/, "key", "value", wopts)); } + + for (size_t i = 0; i != num_cfs; ++i) { + auto cfh = static_cast(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty()); + } + std::vector cf_ids; for (size_t i = 0; i != num_cfs; ++i) { cf_ids.emplace_back(static_cast(i)); } ASSERT_OK(Flush(cf_ids)); + for (size_t i = 0; i != num_cfs; ++i) { auto cfh = static_cast(handles_[i]); + ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush); ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); } } +TEST_P(DBAtomicFlushTest, PrecomputeMinLogNumberToKeepNon2PC) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = GetParam(); + options.write_buffer_size = (static_cast(64) << 20); + CreateAndReopenWithCF({"pikachu"}, options); + + const size_t num_cfs = handles_.size(); + ASSERT_EQ(num_cfs, 2); + WriteOptions wopts; + for (size_t i = 0; i != num_cfs; ++i) { + ASSERT_OK(Put(static_cast(i) /*cf*/, "key", "value", wopts)); + } + + { + // Flush the default CF only. + std::vector cf_ids{0}; + ASSERT_OK(Flush(cf_ids)); + + autovector flushed_cfds; + autovector> flush_edits; + auto flushed_cfh = static_cast(handles_[0]); + flushed_cfds.push_back(flushed_cfh->cfd()); + flush_edits.push_back({}); + auto unflushed_cfh = static_cast(handles_[1]); + + ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->TEST_GetVersionSet(), + flushed_cfds, flush_edits), + unflushed_cfh->cfd()->GetLogNumber()); + } + + { + // Flush all CFs. + std::vector cf_ids; + for (size_t i = 0; i != num_cfs; ++i) { + cf_ids.emplace_back(static_cast(i)); + } + ASSERT_OK(Flush(cf_ids)); + uint64_t log_num_after_flush = dbfull()->TEST_GetCurrentLogNumber(); + + uint64_t min_log_number_to_keep = port::kMaxUint64; + autovector flushed_cfds; + autovector> flush_edits; + for (size_t i = 0; i != num_cfs; ++i) { + auto cfh = static_cast(handles_[i]); + flushed_cfds.push_back(cfh->cfd()); + flush_edits.push_back({}); + min_log_number_to_keep = + std::min(min_log_number_to_keep, cfh->cfd()->GetLogNumber()); + } + ASSERT_EQ(min_log_number_to_keep, log_num_after_flush); + ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->TEST_GetVersionSet(), + flushed_cfds, flush_edits), + min_log_number_to_keep); + } +} + TEST_P(DBAtomicFlushTest, AtomicFlushTriggeredByMemTableFull) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -713,7 +1501,8 @@ TEST_P(DBAtomicFlushTest, AtomicFlushRollbackSomeJobs) { fault_injection_env->SetFilesystemActive(false); TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2"); for (auto* cfh : handles_) { - dbfull()->TEST_WaitForFlushMemTable(cfh); + // Returns the IO error happend during flush. 
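// Conceptual sketch of the invariant checked above; MinWalToKeepNon2PC is a
// hypothetical stand-alone helper, not the real VersionSet-based function. In
// non-2PC mode a WAL becomes obsolete once every column family has flushed the
// data it contains, so the floor is the minimum current log number across all
// CFs (a just-flushed CF contributes its post-flush log number).
#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

uint64_t MinWalToKeepNon2PC(const std::vector<uint64_t>& cf_log_numbers) {
  uint64_t min_log = std::numeric_limits<uint64_t>::max();
  for (uint64_t log_number : cf_log_numbers) {
    min_log = std::min(min_log, log_number);
  }
  return min_log;
}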
+ ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable(cfh)); } for (size_t i = 0; i != num_cfs; ++i) { auto cfh = static_cast(handles_[i]); diff --git a/db/compacted_db_impl.cc b/db/db_impl/compacted_db_impl.cc similarity index 93% rename from db/compacted_db_impl.cc rename to db/db_impl/compacted_db_impl.cc index cd4f27b9e63..076ce818fd1 100644 --- a/db/compacted_db_impl.cc +++ b/db/db_impl/compacted_db_impl.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #ifndef ROCKSDB_LITE -#include "db/compacted_db_impl.h" +#include "db/db_impl/compacted_db_impl.h" #include "db/db_impl/db_impl.h" #include "db/version_set.h" @@ -17,11 +17,13 @@ extern void MarkKeyMayExist(void* arg); extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, const Slice& v, bool hit_and_return); -CompactedDBImpl::CompactedDBImpl( - const DBOptions& options, const std::string& dbname) - : DBImpl(options, dbname), cfd_(nullptr), version_(nullptr), - user_comparator_(nullptr) { -} +CompactedDBImpl::CompactedDBImpl(const DBOptions& options, + const std::string& dbname) + : DBImpl(options, dbname, /*seq_per_batch*/ false, +/*batch_per_txn*/ true, + /*read_only*/ true), + cfd_(nullptr), + version_(nullptr), + user_comparator_(nullptr) {} CompactedDBImpl::~CompactedDBImpl() { } @@ -78,6 +80,7 @@ std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, nullptr, nullptr, nullptr, true, nullptr, nullptr); LookupKey lkey(keys[idx], kMaxSequenceNumber); Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr); + assert(static_cast(idx) < statuses.size()); if (!s.ok() && !s.IsNotFound()) { statuses[idx] = s; } else { diff --git a/db/compacted_db_impl.h b/db/db_impl/compacted_db_impl.h similarity index 96% rename from db/compacted_db_impl.h rename to db/db_impl/compacted_db_impl.h index 7099566fc81..4cf00785e2f 100644 --- a/db/compacted_db_impl.h +++ b/db/db_impl/compacted_db_impl.h @@ -18,7 +18,7 @@ class CompactedDBImpl : public DBImpl { CompactedDBImpl(const CompactedDBImpl&) = delete; void operator=(const CompactedDBImpl&) = delete; - virtual ~CompactedDBImpl(); + ~CompactedDBImpl() override; static Status Open(const Options& options, const std::string& dbname, DB** dbptr); @@ -82,6 +82,11 @@ class CompactedDBImpl : public DBImpl { ColumnFamilyHandle* /*column_family*/) override { return Status::NotSupported("Not supported in compacted db mode."); } + + virtual Status SyncWAL() override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DB::IngestExternalFile; virtual Status IngestExternalFile( ColumnFamilyHandle* /*column_family*/, diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 29b4ac10abd..4e64963ed68 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -53,7 +53,6 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "db/write_callback.h" -#include "env/composite_env_wrapper.h" #include "file/file_util.h" #include "file/filename.h" #include "file/random_access_file_reader.h" @@ -83,6 +82,7 @@ #include "rocksdb/stats_history.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_factory.h" @@ -94,7 +94,6 @@ #include "table/two_level_iterator.h" #include "test_util/sync_point.h" #include "util/autovector.h" -#include "util/build_version.h" #include "util/cast_util.h" #include "util/coding.h" #include "util/compression.h" @@ 
-147,27 +146,31 @@ void DumpSupportInfo(Logger* logger) { } // namespace DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, - const bool seq_per_batch, const bool batch_per_txn) + const bool seq_per_batch, const bool batch_per_txn, + bool read_only) : dbname_(dbname), own_info_log_(options.info_log == nullptr), - initial_db_options_(SanitizeOptions(dbname, options)), + initial_db_options_(SanitizeOptions(dbname, options, read_only)), env_(initial_db_options_.env), io_tracer_(std::make_shared()), immutable_db_options_(initial_db_options_), fs_(immutable_db_options_.fs, io_tracer_), mutable_db_options_(initial_db_options_), - stats_(immutable_db_options_.statistics.get()), - mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS, + stats_(immutable_db_options_.stats), + mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS, immutable_db_options_.use_adaptive_mutex), default_cf_handle_(nullptr), + error_handler_(this, immutable_db_options_, &mutex_), + event_logger_(immutable_db_options_.info_log.get()), max_total_in_memory_state_(0), file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)), file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite( file_options_, immutable_db_options_)), seq_per_batch_(seq_per_batch), batch_per_txn_(batch_per_txn), - db_lock_(nullptr), + next_job_id_(1), shutting_down_(false), + db_lock_(nullptr), manual_compaction_paused_(false), bg_cv_(&mutex_), logfile_number_(0), @@ -192,9 +195,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, bg_purge_scheduled_(0), disable_delete_obsolete_files_(0), pending_purge_obsolete_files_(0), - delete_obsolete_files_last_run_(env_->NowMicros()), + delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()), last_stats_dump_time_microsec_(0), - next_job_id_(1), has_unpersisted_data_(false), unable_to_release_oldest_log_(false), num_running_ingest_file_(0), @@ -202,7 +204,6 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, wal_manager_(immutable_db_options_, file_options_, io_tracer_, seq_per_batch), #endif // ROCKSDB_LITE - event_logger_(immutable_db_options_.info_log.get()), bg_work_paused_(0), bg_compaction_paused_(0), refitting_level_(false), @@ -231,8 +232,9 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, own_sfm_(options.sst_file_manager == nullptr), preserve_deletes_(options.preserve_deletes), closed_(false), - error_handler_(this, immutable_db_options_, &mutex_), - atomic_flush_install_cv_(&mutex_) { + atomic_flush_install_cv_(&mutex_), + blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_, + &error_handler_) { // !batch_per_trx_ implies seq_per_batch_ because it is only unset for // WriteUnprepared, which should use seq_per_batch_. 
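// The new read_only argument is threaded through to SanitizeOptions() by the
// read-only and compacted open paths. For context, a minimal sketch of the public
// entry point (path hypothetical); a read-only open may be served by
// CompactedDBImpl when the database is fully compacted, in which case write-path
// calls such as SyncWAL() report NotSupported.
#include "rocksdb/db.h"

void OpenReadOnlyExample() {
  ROCKSDB_NAMESPACE::DB* db = nullptr;
  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::DB::OpenForReadOnly(
      ROCKSDB_NAMESPACE::Options(), "/tmp/readonly_db_example", &db);
  if (s.ok()) {
    db->SyncWAL().PermitUncheckedError();  // NotSupported in compacted mode
    delete db;
  }
}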
assert(batch_per_txn_ || seq_per_batch_); @@ -249,16 +251,17 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, co.num_shard_bits = immutable_db_options_.table_cache_numshardbits; co.metadata_charge_policy = kDontChargeCacheMetadata; table_cache_ = NewLRUCache(co); + SetDbSessionId(); + assert(!db_session_id_.empty()); versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_, table_cache_.get(), write_buffer_manager_, &write_controller_, &block_cache_tracer_, - io_tracer_)); + io_tracer_, db_session_id_)); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); DumpRocksDBBuildVersion(immutable_db_options_.info_log.get()); - SetDbSessionId(); DumpDBFileSummary(immutable_db_options_, dbname_, db_session_id_); immutable_db_options_.Dump(immutable_db_options_.info_log.get()); mutable_db_options_.Dump(immutable_db_options_.info_log.get()); @@ -268,6 +271,10 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, // we won't drop any deletion markers until SetPreserveDeletesSequenceNumber() // is called by client and this seqnum is advanced. preserve_deletes_seqnum_.store(0); + + if (write_buffer_manager_) { + wbm_stall_.reset(new WBMStallInterface()); + } } Status DBImpl::Resume() { @@ -307,18 +314,21 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { mutex_.AssertHeld(); WaitForBackgroundWork(); - Status bg_error = error_handler_.GetBGError(); Status s; if (shutdown_initiated_) { // Returning shutdown status to SFM during auto recovery will cause it // to abort the recovery and allow the shutdown to progress s = Status::ShutdownInProgress(); } - if (s.ok() && bg_error.severity() > Status::Severity::kHardError) { - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "DB resume requested but failed due to Fatal/Unrecoverable error"); - s = bg_error; + + if (s.ok()) { + Status bg_error = error_handler_.GetBGError(); + if (bg_error.severity() > Status::Severity::kHardError) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "DB resume requested but failed due to Fatal/Unrecoverable error"); + s = bg_error; + } } // Make sure the IO Status stored in version set is set to OK. @@ -393,6 +403,11 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { FindObsoleteFiles(&job_context, true); if (s.ok()) { s = error_handler_.ClearBGError(); + } else { + // NOTE: this is needed to pass ASSERT_STATUS_CHECKED + // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test. + // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952 + error_handler_.GetRecoveryError().PermitUncheckedError(); } mutex_.Unlock(); @@ -409,6 +424,12 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { if (file_deletion_disabled) { // Always return ok s = EnableFileDeletions(/*force=*/true); + if (!s.ok()) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "DB resume requested but could not enable file deletions [%s]", + s.ToString().c_str()); + } } ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); } @@ -460,7 +481,9 @@ void DBImpl::CancelAllBackgroundWork(bool wait) { autovector cfds; SelectColumnFamiliesForAtomicFlush(&cfds); mutex_.Unlock(); - AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown); + Status s = + AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown); + s.PermitUncheckedError(); //**TODO: What to do on error? 
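// Aside on the PermitUncheckedError() calls introduced in this change: under
// ASSERT_STATUS_CHECKED builds a Status aborts if it is destroyed without being
// inspected, so results that are deliberately ignored are marked explicitly.
// Minimal sketch (SomeBestEffortCleanup is a hypothetical call whose failure is
// tolerable here):
Status cleanup_status = SomeBestEffortCleanup();
cleanup_status.PermitUncheckedError();  // acknowledge the result is intentionally ignored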
mutex_.Lock(); } else { for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -496,19 +519,20 @@ Status DBImpl::CloseHelper() { } mutex_.Unlock(); + // Below check is added as recovery_error_ is not checked and it causes crash + // in DBSSTTest.DBWithMaxSpaceAllowedWithBlobFiles when space limit is + // reached. + error_handler_.GetRecoveryError().PermitUncheckedError(); + // CancelAllBackgroundWork called with false means we just set the shutdown // marker. After this we do a variant of the waiting and unschedule work // (to consider: moving all the waiting into CancelAllBackgroundWork(true)) CancelAllBackgroundWork(false); - int bottom_compactions_unscheduled = - env_->UnSchedule(this, Env::Priority::BOTTOM); - int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW); - int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH); - Status ret = Status::OK(); mutex_.Lock(); - bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled; - bg_compaction_scheduled_ -= compactions_unscheduled; - bg_flush_scheduled_ -= flushes_unscheduled; + env_->UnSchedule(this, Env::Priority::BOTTOM); + env_->UnSchedule(this, Env::Priority::LOW); + env_->UnSchedule(this, Env::Priority::HIGH); + Status ret = Status::OK(); // Wait for background work to finish while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || @@ -636,11 +660,15 @@ Status DBImpl::CloseHelper() { if (immutable_db_options_.info_log && own_info_log_) { Status s = immutable_db_options_.info_log->Close(); - if (!s.ok() && ret.ok()) { + if (!s.ok() && !s.IsNotSupported() && ret.ok()) { ret = s; } } + if (write_buffer_manager_ && wbm_stall_) { + write_buffer_manager_->RemoveDBFromQueue(wbm_stall_.get()); + } + if (ret.IsAborted()) { // Reserve IsAborted() error for those where users didn't release // certain resource and they can release them and come back and @@ -670,8 +698,8 @@ void DBImpl::MaybeIgnoreError(Status* s) const { } const Status DBImpl::CreateArchivalDirectory() { - if (immutable_db_options_.wal_ttl_seconds > 0 || - immutable_db_options_.wal_size_limit_mb > 0) { + if (immutable_db_options_.WAL_ttl_seconds > 0 || + immutable_db_options_.WAL_size_limit_MB > 0) { std::string archivalPath = ArchivalDirectory(immutable_db_options_.wal_dir); return env_->CreateDirIfMissing(archivalPath); } @@ -679,7 +707,7 @@ const Status DBImpl::CreateArchivalDirectory() { } void DBImpl::PrintStatistics() { - auto dbstats = immutable_db_options_.statistics.get(); + auto dbstats = immutable_db_options_.stats; if (dbstats) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s", dbstats->ToString().c_str()); @@ -688,6 +716,18 @@ void DBImpl::PrintStatistics() { void DBImpl::StartPeriodicWorkScheduler() { #ifndef ROCKSDB_LITE + +#ifndef NDEBUG + // It only used by test to disable scheduler + bool disable_scheduler = false; + TEST_SYNC_POINT_CALLBACK( + "DBImpl::StartPeriodicWorkScheduler:DisableScheduler", + &disable_scheduler); + if (disable_scheduler) { + return; + } +#endif // !NDEBUG + { InstrumentedMutexLock l(&mutex_); periodic_work_scheduler_ = PeriodicWorkScheduler::Default(); @@ -725,9 +765,10 @@ void DBImpl::PersistStats() { return; } TEST_SYNC_POINT("DBImpl::PersistStats:StartRunning"); - uint64_t now_seconds = env_->NowMicros() / kMicrosInSecond; + uint64_t now_seconds = + immutable_db_options_.clock->NowMicros() / kMicrosInSecond; - Statistics* statistics = immutable_db_options_.statistics.get(); + Statistics* statistics = immutable_db_options_.stats; if (!statistics) { return; } @@ 
-866,13 +907,6 @@ Status DBImpl::GetStatsHistory( void DBImpl::DumpStats() { TEST_SYNC_POINT("DBImpl::DumpStats:1"); #ifndef ROCKSDB_LITE - const DBPropertyInfo* cf_property_info = - GetPropertyInfo(DB::Properties::kCFStats); - assert(cf_property_info != nullptr); - const DBPropertyInfo* db_property_info = - GetPropertyInfo(DB::Properties::kDBStats); - assert(db_property_info != nullptr); - std::string stats; if (shutdown_initiated_) { return; @@ -880,18 +914,29 @@ void DBImpl::DumpStats() { TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning"); { InstrumentedMutexLock l(&mutex_); - default_cf_internal_stats_->GetStringProperty( - *db_property_info, DB::Properties::kDBStats, &stats); + const std::string* property = &DB::Properties::kDBStats; + const DBPropertyInfo* property_info = GetPropertyInfo(*property); + assert(property_info != nullptr); + default_cf_internal_stats_->GetStringProperty(*property_info, *property, + &stats); + + property = &DB::Properties::kCFStatsNoFileHistogram; + property_info = GetPropertyInfo(*property); + assert(property_info != nullptr); for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->initialized()) { - cfd->internal_stats()->GetStringProperty( - *cf_property_info, DB::Properties::kCFStatsNoFileHistogram, &stats); + cfd->internal_stats()->GetStringProperty(*property_info, *property, + &stats); } } + + property = &DB::Properties::kCFFileHistogram; + property_info = GetPropertyInfo(*property); + assert(property_info != nullptr); for (auto cfd : *versions_->GetColumnFamilySet()) { if (cfd->initialized()) { - cfd->internal_stats()->GetStringProperty( - *cf_property_info, DB::Properties::kCFFileHistogram, &stats); + cfd->internal_stats()->GetStringProperty(*property_info, *property, + &stats); } } } @@ -1281,7 +1326,11 @@ Status DBImpl::SyncWAL() { TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1"); { InstrumentedMutexLock l(&mutex_); - MarkLogsSynced(current_log_number, need_log_dir_sync, status); + if (status.ok()) { + status = MarkLogsSynced(current_log_number, need_log_dir_sync); + } else { + MarkLogsNotSynced(current_log_number); + } } TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); @@ -1307,27 +1356,54 @@ Status DBImpl::UnlockWAL() { return Status::OK(); } -void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, - const Status& status) { +Status DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir) { mutex_.AssertHeld(); - if (synced_dir && logfile_number_ == up_to && status.ok()) { + if (synced_dir && logfile_number_ == up_to) { log_dir_synced_ = true; } + VersionEdit synced_wals; for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { - auto& log = *it; - assert(log.getting_synced); - if (status.ok() && logs_.size() > 1) { - logs_to_free_.push_back(log.ReleaseWriter()); + auto& wal = *it; + assert(wal.getting_synced); + if (logs_.size() > 1) { + if (immutable_db_options_.track_and_verify_wals_in_manifest && + wal.writer->file()->GetFileSize() > 0) { + synced_wals.AddWal(wal.number, + WalMetadata(wal.writer->file()->GetFileSize())); + } + logs_to_free_.push_back(wal.ReleaseWriter()); // To modify logs_ both mutex_ and log_write_mutex_ must be held InstrumentedMutexLock l(&log_write_mutex_); it = logs_.erase(it); } else { - log.getting_synced = false; + wal.getting_synced = false; ++it; } } - assert(!status.ok() || logs_.empty() || logs_[0].number > up_to || + assert(logs_.empty() || logs_[0].number > up_to || (logs_.size() == 1 && !logs_[0].getting_synced)); + + Status s; + if (synced_wals.IsWalAddition()) { 
+ // not empty, write to MANIFEST. + s = versions_->LogAndApplyToDefaultColumnFamily(&synced_wals, &mutex_); + if (!s.ok() && versions_->io_status().IsIOError()) { + s = error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + } + log_sync_cv_.SignalAll(); + return s; +} + +void DBImpl::MarkLogsNotSynced(uint64_t up_to) { + mutex_.AssertHeld(); + for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to; + ++it) { + auto& wal = *it; + assert(wal.getting_synced); + wal.getting_synced = false; + } log_sync_cv_.SignalAll(); } @@ -1595,8 +1671,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, } #endif // NDEBUG - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_GET); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); auto cfh = static_cast_with_check( @@ -1660,7 +1736,9 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, } // If timestamp is used, we use read callback to ensure is returned // only if t <= read_opts.timestamp and s <= snapshot. - if (ts_sz > 0 && !get_impl_options.callback) { + if (ts_sz > 0) { + assert(!get_impl_options + .callback); // timestamp with callback is not supported read_cb.Refresh(snapshot); get_impl_options.callback = &read_cb; } @@ -1784,8 +1862,8 @@ std::vector DBImpl::MultiGet( const std::vector& column_family, const std::vector& keys, std::vector* values, std::vector* timestamps) { - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_MULTIGET); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); PERF_TIMER_GUARD(get_snapshot_time); #ifndef NDEBUG @@ -1802,6 +1880,16 @@ std::vector DBImpl::MultiGet( } #endif // NDEBUG + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? + tracer_->MultiGet(column_family, keys).PermitUncheckedError(); + } + } + SequenceNumber consistent_seqnum; std::unordered_map multiget_cf_data( @@ -1914,9 +2002,8 @@ std::vector DBImpl::MultiGet( break; } } - if (read_options.deadline.count() && - env_->NowMicros() > + immutable_db_options_.clock->NowMicros() > static_cast(read_options.deadline.count())) { break; } @@ -1925,8 +2012,8 @@ std::vector DBImpl::MultiGet( if (keys_read < num_keys) { // The only reason to break out of the loop is when the deadline is // exceeded - assert(env_->NowMicros() > - static_cast(read_options.deadline.count())); + assert(immutable_db_options_.clock->NowMicros() > + static_cast(read_options.deadline.count())); for (++keys_read; keys_read < num_keys; ++keys_read) { stat_list[keys_read] = Status::TimedOut(); } @@ -2114,6 +2201,16 @@ void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, } #endif // NDEBUG + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? 
+ tracer_->MultiGet(num_keys, column_families, keys).PermitUncheckedError(); + } + } + autovector key_context; autovector sorted_keys; sorted_keys.resize(num_keys); @@ -2172,7 +2269,7 @@ void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, for (; cf_iter != multiget_cf_data.end(); ++cf_iter) { s = MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, &sorted_keys, cf_iter->super_version, consistent_seqnum, - read_callback, nullptr); + read_callback); if (!s.ok()) { break; } @@ -2276,6 +2373,15 @@ void DBImpl::MultiGet(const ReadOptions& read_options, const Slice* keys, PinnableSlice* values, std::string* timestamps, Status* statuses, const bool sorted_input) { + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + // TODO: maybe handle the tracing status? + tracer_->MultiGet(num_keys, column_family, keys).PermitUncheckedError(); + } + } autovector key_context; autovector sorted_keys; sorted_keys.resize(num_keys); @@ -2336,15 +2442,16 @@ void DBImpl::MultiGetWithCallback( } GetWithTimestampReadCallback timestamp_read_callback(0); - ReadCallback* read_callback = nullptr; + ReadCallback* read_callback = callback; if (read_options.timestamp && read_options.timestamp->size() > 0) { + assert(!read_callback); // timestamp with callback is not supported timestamp_read_callback.Refresh(consistent_seqnum); read_callback = ×tamp_read_callback; } Status s = MultiGetImpl(read_options, 0, num_keys, sorted_keys, multiget_cf_data[0].super_version, consistent_seqnum, - read_callback, nullptr); + read_callback); assert(s.ok() || s.IsTimedOut() || s.IsAborted()); ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd, multiget_cf_data[0].super_version); @@ -2363,9 +2470,9 @@ Status DBImpl::MultiGetImpl( const ReadOptions& read_options, size_t start_key, size_t num_keys, autovector* sorted_keys, SuperVersion* super_version, SequenceNumber snapshot, - ReadCallback* callback, bool* is_blob_index) { - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_MULTIGET); + ReadCallback* callback) { + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); // For each of the given keys, apply the entire "get" process as follows: // First look in the memtable, then in the immutable memtable (if any). 
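// Aside on the double-checked pattern used for tracer_ above: the unlocked test
// keeps the hot path cheap when tracing is off, and the re-test under the mutex
// guards against the tracer being torn down concurrently. A generic sketch with
// the standard library (illustrative only; the real code uses InstrumentedMutex
// and still carries a TODO about removing the lock from this path):
#include <mutex>

struct QueryTracer { void Record() {} };  // stand-in type for illustration

std::mutex trace_mutex;
QueryTracer* query_tracer = nullptr;

void MaybeTrace() {
  if (query_tracer != nullptr) {            // cheap unlocked check
    std::lock_guard<std::mutex> guard(trace_mutex);
    if (query_tracer != nullptr) {          // re-check under the lock before use
      query_tracer->Record();
    }
  }
}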
@@ -2376,7 +2483,7 @@ Status DBImpl::MultiGetImpl( uint64_t curr_value_size = 0; while (keys_left) { if (read_options.deadline.count() && - env_->NowMicros() > + immutable_db_options_.clock->NowMicros() > static_cast(read_options.deadline.count())) { s = Status::TimedOut(); break; @@ -2402,11 +2509,9 @@ Status DBImpl::MultiGetImpl( (read_options.read_tier == kPersistedTier && has_unpersisted_data_.load(std::memory_order_relaxed)); if (!skip_memtable) { - super_version->mem->MultiGet(read_options, &range, callback, - is_blob_index); + super_version->mem->MultiGet(read_options, &range, callback); if (!range.empty()) { - super_version->imm->MultiGet(read_options, &range, callback, - is_blob_index); + super_version->imm->MultiGet(read_options, &range, callback); } if (!range.empty()) { lookup_current = true; @@ -2416,8 +2521,7 @@ Status DBImpl::MultiGetImpl( } if (lookup_current) { PERF_TIMER_GUARD(get_from_output_files_time); - super_version->current->MultiGet(read_options, &range, callback, - is_blob_index); + super_version->current->MultiGet(read_options, &range, callback); } curr_value_size = range.GetValueSize(); if (curr_value_size > read_options.value_size_soft_limit) { @@ -2771,7 +2875,7 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, /* allow_unprepared_value */ true); result = NewDBIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, - cfd->user_comparator(), iter, kMaxSequenceNumber, + cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, this, cfd); #endif @@ -2792,7 +2896,7 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options, ColumnFamilyData* cfd, SequenceNumber snapshot, ReadCallback* read_callback, - bool allow_blob, + bool expose_blob_index, bool allow_refresh) { SuperVersion* sv = cfd->GetReferencedSuperVersion(this); @@ -2857,9 +2961,9 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options, // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, read_callback, this, cfd, allow_blob, + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, sv->current, + snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, read_callback, this, cfd, expose_blob_index, read_options.snapshot != nullptr ? 
false : allow_refresh); InternalIterator* internal_iter = NewInternalIterator( @@ -2897,7 +3001,7 @@ Status DBImpl::NewIterators( /* allow_unprepared_value */ true); iterators->push_back(NewDBIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, - cfd->user_comparator(), iter, kMaxSequenceNumber, + cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, this, cfd)); } @@ -2932,7 +3036,8 @@ const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary, bool lock) { int64_t unix_time = 0; - env_->GetCurrentTime(&unix_time).PermitUncheckedError(); // Ignore error + immutable_db_options_.clock->GetCurrentTime(&unix_time) + .PermitUncheckedError(); // Ignore error SnapshotImpl* s = new SnapshotImpl; if (lock) { @@ -3069,20 +3174,25 @@ const std::string& DBImpl::GetName() const { return dbname_; } Env* DBImpl::GetEnv() const { return env_; } FileSystem* DB::GetFileSystem() const { - static LegacyFileSystemWrapper fs_wrap(GetEnv()); - return &fs_wrap; + const auto& fs = GetEnv()->GetFileSystem(); + return fs.get(); } FileSystem* DBImpl::GetFileSystem() const { return immutable_db_options_.fs.get(); } +SystemClock* DBImpl::GetSystemClock() const { + return immutable_db_options_.clock; +} + #ifndef ROCKSDB_LITE -Status DBImpl::StartIOTrace(Env* env, const TraceOptions& trace_options, +Status DBImpl::StartIOTrace(const TraceOptions& trace_options, std::unique_ptr&& trace_writer) { assert(trace_writer != nullptr); - return io_tracer_->StartIOTrace(env, trace_options, std::move(trace_writer)); + return io_tracer_->StartIOTrace(GetSystemClock(), trace_options, + std::move(trace_writer)); } Status DBImpl::EndIOTrace() { @@ -3201,7 +3311,7 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd, bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) { assert(value != nullptr); - Statistics* statistics = immutable_db_options_.statistics.get(); + Statistics* statistics = immutable_db_options_.stats; if (!statistics) { return false; } @@ -3368,6 +3478,10 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, return Status::InvalidArgument("Invalid options"); } + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + Version* v; auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); @@ -3375,9 +3489,23 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, v = sv->current; for (int i = 0; i < n; i++) { + Slice start = range[i].start; + Slice limit = range[i].limit; + + // Add timestamp if needed + std::string start_with_ts, limit_with_ts; + if (ts_sz > 0) { + // Maximum timestamp means including all key with any timestamp + AppendKeyWithMaxTimestamp(&start_with_ts, start, ts_sz); + // Append a maximum timestamp as the range limit is exclusive: + // [start, limit) + AppendKeyWithMaxTimestamp(&limit_with_ts, limit, ts_sz); + start = start_with_ts; + limit = limit_with_ts; + } // Convert user_key into a corresponding internal key. 
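// Public-API sketch of the size-approximation path reworked above; when the
// column family uses a user-defined timestamp, the range limits are padded
// internally with a maximum timestamp so that [start, limit) covers every
// timestamped version of the keys.
#include "rocksdb/db.h"

void ApproximateRangeSize(ROCKSDB_NAMESPACE::DB* db) {
  using namespace ROCKSDB_NAMESPACE;
  Range ranges[1] = {Range("a", "m")};  // user keys; the limit is exclusive
  uint64_t sizes[1] = {0};
  SizeApproximationOptions size_opts;   // defaults: include files, skip memtables
  db->GetApproximateSizes(size_opts, db->DefaultColumnFamily(), ranges, 1, sizes)
      .PermitUncheckedError();
}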
- InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k1(start, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(limit, kMaxSequenceNumber, kValueTypeForSeek); sizes[i] = 0; if (options.include_files) { sizes[i] += versions_->ApproximateSize( @@ -3429,14 +3557,13 @@ Status DBImpl::DeleteFile(std::string name) { FileType type; WalFileType log_type; if (!ParseFileName(name, &number, &type, &log_type) || - (type != kTableFile && type != kLogFile)) { + (type != kTableFile && type != kWalFile)) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n", name.c_str()); return Status::InvalidArgument("Invalid file name"); } - Status status; - if (type == kLogFile) { + if (type == kWalFile) { // Only allow deleting archived log files if (log_type != kArchivedLogFile) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, @@ -3444,7 +3571,7 @@ Status DBImpl::DeleteFile(std::string name) { name.c_str()); return Status::NotSupported("Delete only supported for archived logs"); } - status = wal_manager_.DeleteFile(name, number); + Status status = wal_manager_.DeleteFile(name, number); if (!status.ok()) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed -- %s.\n", name.c_str(), @@ -3453,6 +3580,7 @@ Status DBImpl::DeleteFile(std::string name) { return status; } + Status status; int level; FileMetaData* metadata; ColumnFamilyData* cfd; @@ -3526,7 +3654,7 @@ Status DBImpl::DeleteFile(std::string name) { Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end) { - Status status; + Status status = Status::OK(); auto cfh = static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); VersionEdit edit; @@ -3581,11 +3709,13 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, deleted_files.insert(level_file); level_file->being_compacted = true; } + vstorage->ComputeCompactionScore(*cfd->ioptions(), + *cfd->GetLatestMutableCFOptions()); } } if (edit.GetDeletedFiles().empty()) { job_context.Clean(); - return Status::OK(); + return status; } input_version->Ref(); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), @@ -3861,7 +3991,8 @@ Status DestroyDB(const std::string& dbname, const Options& options, std::string path_to_delete = dbname + "/" + fname; if (type == kMetaDatabase) { del = DestroyDB(path_to_delete, options); - } else if (type == kTableFile || type == kLogFile) { + } else if (type == kTableFile || type == kWalFile || + type == kBlobFile) { del = DeleteDBFile(&soptions, path_to_delete, dbname, /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); } else { @@ -3886,9 +4017,10 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (env->GetChildren(path, &filenames).ok()) { for (const auto& fname : filenames) { if (ParseFileName(fname, &number, &type) && - type == kTableFile) { // Lock file will be deleted at end - std::string table_path = path + "/" + fname; - Status del = DeleteDBFile(&soptions, table_path, dbname, + (type == kTableFile || + type == kBlobFile)) { // Lock file will be deleted at end + std::string file_path = path + "/" + fname; + Status del = DeleteDBFile(&soptions, file_path, dbname, /*force_bg=*/false, /*force_fg=*/false); if (!del.ok() && result.ok()) { result = del; @@ -3915,7 +4047,7 @@ Status DestroyDB(const std::string& dbname, const Options& options, if (env->GetChildren(archivedir, 
&archiveFiles).ok()) { // Delete archival files. for (const auto& file : archiveFiles) { - if (ParseFileName(file, &number, &type) && type == kLogFile) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { Status del = DeleteDBFile(&soptions, archivedir + "/" + file, archivedir, /*force_bg=*/false, /*force_fg=*/!wal_in_db_path); @@ -3931,7 +4063,7 @@ Status DestroyDB(const std::string& dbname, const Options& options, // Delete log files in the WAL dir if (wal_dir_exists) { for (const auto& file : walDirFiles) { - if (ParseFileName(file, &number, &type) && type == kLogFile) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { Status del = DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), soptions.wal_dir, /*force_bg=*/false, @@ -4136,16 +4268,17 @@ void DBImpl::EraseThreadStatusDbInfo() const {} // // A global method that can dump out the build version void DumpRocksDBBuildVersion(Logger* log) { -#if !defined(IOS_CROSS_COMPILE) - // if we compile with Xcode, we don't run build_detect_version, so we don't - // generate util/build_version.cc - ROCKS_LOG_HEADER(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, - ROCKSDB_MINOR, ROCKSDB_PATCH); - ROCKS_LOG_HEADER(log, "Git sha %s", rocksdb_build_git_sha); - ROCKS_LOG_HEADER(log, "Compile date %s", rocksdb_build_compile_date); -#else - (void)log; // ignore "-Wunused-parameter" -#endif + ROCKS_LOG_HEADER(log, "RocksDB version: %s\n", + GetRocksVersionAsString().c_str()); + const auto& props = GetRocksBuildProperties(); + const auto& sha = props.find("rocksdb_build_git_sha"); + if (sha != props.end()) { + ROCKS_LOG_HEADER(log, "Git sha %s", sha->second.c_str()); + } + const auto date = props.find("rocksdb_build_date"); + if (date != props.end()) { + ROCKS_LOG_HEADER(log, "Compile date %s", date->second.c_str()); + } } #ifndef ROCKSDB_LITE @@ -4310,7 +4443,7 @@ Status DBImpl::IngestExternalFiles( } } // Ingest multiple external SST files atomically. 
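// Minimal public-API sketch of single-file ingestion for context; the multi-CF
// variant below takes a vector of IngestExternalFileArg and applies all column
// families atomically. The SST path is hypothetical.
#include "rocksdb/db.h"

ROCKSDB_NAMESPACE::Status IngestOne(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::IngestExternalFileOptions ifo;
  ifo.move_files = false;  // copy the file into the DB instead of hard-linking it
  return db->IngestExternalFile({"/tmp/example.sst"}, ifo);
}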
- size_t num_cfs = args.size(); + const size_t num_cfs = args.size(); for (size_t i = 0; i != num_cfs; ++i) { if (args[i].external_files.empty()) { char err_msg[128] = {0}; @@ -4347,14 +4480,11 @@ Status DBImpl::IngestExternalFiles( std::vector ingestion_jobs; for (const auto& arg : args) { auto* cfd = static_cast(arg.column_family)->cfd(); - ingestion_jobs.emplace_back( - env_, versions_.get(), cfd, immutable_db_options_, file_options_, - &snapshots_, arg.options, &directories_, &event_logger_, io_tracer_); - } - std::vector> exec_results; - for (size_t i = 0; i != num_cfs; ++i) { - exec_results.emplace_back(false, Status::OK()); + ingestion_jobs.emplace_back(versions_.get(), cfd, immutable_db_options_, + file_options_, &snapshots_, arg.options, + &directories_, &event_logger_, io_tracer_); } + // TODO(yanqin) maybe make jobs run in parallel uint64_t start_file_number = next_file_number; for (size_t i = 1; i != num_cfs; ++i) { @@ -4362,10 +4492,13 @@ Status DBImpl::IngestExternalFiles( auto* cfd = static_cast(args[i].column_family)->cfd(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); - exec_results[i].second = ingestion_jobs[i].Prepare( + Status es = ingestion_jobs[i].Prepare( args[i].external_files, args[i].files_checksums, args[i].files_checksum_func_names, start_file_number, super_version); - exec_results[i].first = true; + // capture first error only + if (!es.ok() && status.ok()) { + status = es; + } CleanupSuperVersion(super_version); } TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0"); @@ -4374,23 +4507,17 @@ Status DBImpl::IngestExternalFiles( auto* cfd = static_cast(args[0].column_family)->cfd(); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); - exec_results[0].second = ingestion_jobs[0].Prepare( + Status es = ingestion_jobs[0].Prepare( args[0].external_files, args[0].files_checksums, args[0].files_checksum_func_names, next_file_number, super_version); - exec_results[0].first = true; - CleanupSuperVersion(super_version); - } - for (const auto& exec_result : exec_results) { - if (!exec_result.second.ok()) { - status = exec_result.second; - break; + if (!es.ok()) { + status = es; } + CleanupSuperVersion(super_version); } if (!status.ok()) { for (size_t i = 0; i != num_cfs; ++i) { - if (exec_results[i].first) { - ingestion_jobs[i].Cleanup(status); - } + ingestion_jobs[i].Cleanup(status); } InstrumentedMutexLock l(&mutex_); ReleaseFileNumberFromPendingOutputs(pending_output_elem); @@ -4560,8 +4687,7 @@ Status DBImpl::IngestExternalFiles( // TODO: distinguish between MANIFEST write and CURRENT renaming const IOStatus& io_s = versions_->io_status(); // Should handle return error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite); } // Resume writes to the DB @@ -4623,9 +4749,9 @@ Status DBImpl::CreateColumnFamilyWithImport( // Import sst files from metadata. 
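// The import job constructed below backs CreateColumnFamilyWithImport(). Sketch of
// the public flow, assuming metadata previously produced by
// Checkpoint::ExportColumnFamily(); the column family name is hypothetical.
#include "rocksdb/db.h"
#include "rocksdb/utilities/checkpoint.h"

ROCKSDB_NAMESPACE::Status ImportCF(
    ROCKSDB_NAMESPACE::DB* db,
    const ROCKSDB_NAMESPACE::ExportImportFilesMetaData& metadata,
    ROCKSDB_NAMESPACE::ColumnFamilyHandle** imported) {
  ROCKSDB_NAMESPACE::ImportColumnFamilyOptions import_opts;
  import_opts.move_files = false;  // copy the exported files rather than moving them
  return db->CreateColumnFamilyWithImport(
      ROCKSDB_NAMESPACE::ColumnFamilyOptions(), "imported_cf", import_opts,
      metadata, imported);
}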
auto cfh = static_cast_with_check(*handle); auto cfd = cfh->cfd(); - ImportColumnFamilyJob import_job(env_, versions_.get(), cfd, - immutable_db_options_, file_options_, - import_options, metadata.files, io_tracer_); + ImportColumnFamilyJob import_job(versions_.get(), cfd, immutable_db_options_, + file_options_, import_options, + metadata.files, io_tracer_); SuperVersionContext dummy_sv_ctx(/* create_superversion */ true); VersionEdit dummy_edit; @@ -4782,22 +4908,42 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, j++) { const auto& fd_with_krange = vstorage->LevelFilesBrief(i).files[j]; const auto& fd = fd_with_krange.fd; + const FileMetaData* fmeta = fd_with_krange.file_metadata; + assert(fmeta); std::string fname = TableFileName(cfd->ioptions()->cf_paths, fd.GetNumber(), fd.GetPathId()); if (use_file_checksum) { - const FileMetaData* fmeta = fd_with_krange.file_metadata; - assert(fmeta); - s = VerifySstFileChecksum(*fmeta, fname, read_options); + s = VerifyFullFileChecksum(fmeta->file_checksum, + fmeta->file_checksum_func_name, fname, + read_options); } else { s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(opts, file_options_, read_options, fname); } } } + + if (s.ok() && use_file_checksum) { + const auto& blob_files = vstorage->GetBlobFiles(); + for (const auto& pair : blob_files) { + const uint64_t blob_file_number = pair.first; + const auto& meta = pair.second; + assert(meta); + const std::string blob_file_name = BlobFileName( + cfd->ioptions()->cf_paths.front().path, blob_file_number); + s = VerifyFullFileChecksum(meta->GetChecksumValue(), + meta->GetChecksumMethod(), blob_file_name, + read_options); + if (!s.ok()) { + break; + } + } + } if (!s.ok()) { break; } } + bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io; { @@ -4822,29 +4968,31 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, return s; } -Status DBImpl::VerifySstFileChecksum(const FileMetaData& fmeta, - const std::string& fname, - const ReadOptions& read_options) { +Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected, + const std::string& func_name_expected, + const std::string& fname, + const ReadOptions& read_options) { Status s; - if (fmeta.file_checksum == kUnknownFileChecksum) { + if (file_checksum_expected == kUnknownFileChecksum) { return s; } std::string file_checksum; std::string func_name; s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum( fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(), - fmeta.file_checksum_func_name, &file_checksum, &func_name, + func_name_expected, &file_checksum, &func_name, read_options.readahead_size, immutable_db_options_.allow_mmap_reads, - io_tracer_); + io_tracer_, immutable_db_options_.rate_limiter.get()); if (s.ok()) { - assert(fmeta.file_checksum_func_name == func_name); - if (file_checksum != fmeta.file_checksum) { + assert(func_name_expected == func_name); + if (file_checksum != file_checksum_expected) { std::ostringstream oss; oss << fname << " file checksum mismatch, "; - oss << "expecting " << Slice(fmeta.file_checksum).ToString(/*hex=*/true); + oss << "expecting " + << Slice(file_checksum_expected).ToString(/*hex=*/true); oss << ", but actual " << Slice(file_checksum).ToString(/*hex=*/true); s = Status::Corruption(oss.str()); - TEST_SYNC_POINT_CALLBACK("DBImpl::VerifySstFileChecksum:mismatch", &s); + TEST_SYNC_POINT_CALLBACK("DBImpl::VerifyFullFileChecksum:mismatch", &s); } } return s; @@ -4879,7 +5027,8 @@ void DBImpl::WaitForIngestFile() { Status 
DBImpl::StartTrace(const TraceOptions& trace_options, std::unique_ptr&& trace_writer) { InstrumentedMutexLock lock(&trace_mutex_); - tracer_.reset(new Tracer(env_, trace_options, std::move(trace_writer))); + tracer_.reset(new Tracer(immutable_db_options_.clock, trace_options, + std::move(trace_writer))); return Status::OK(); } @@ -4898,8 +5047,8 @@ Status DBImpl::EndTrace() { Status DBImpl::StartBlockCacheTrace( const TraceOptions& trace_options, std::unique_ptr&& trace_writer) { - return block_cache_tracer_.StartTrace(env_, trace_options, - std::move(trace_writer)); + return block_cache_tracer_.StartTrace(immutable_db_options_.clock, + trace_options, std::move(trace_writer)); } Status DBImpl::EndBlockCacheTrace() { @@ -4907,24 +5056,27 @@ Status DBImpl::EndBlockCacheTrace() { return Status::OK(); } -Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) { +Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound) { Status s; if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - s = tracer_->IteratorSeek(cf_id, key); + s = tracer_->IteratorSeek(cf_id, key, lower_bound, upper_bound); } } return s; } -Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, - const Slice& key) { +Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound) { Status s; if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - s = tracer_->IteratorSeekForPrev(cf_id, key); + s = tracer_->IteratorSeekForPrev(cf_id, key, lower_bound, upper_bound); } } return s; diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 5010bb6f46a..ff46896baba 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -54,9 +54,6 @@ #include "rocksdb/transaction_log.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" -#include "trace_replay/block_cache_tracer.h" -#include "trace_replay/io_tracer.h" -#include "trace_replay/trace_replay.h" #include "util/autovector.h" #include "util/hash.h" #include "util/repeatable_thread.h" @@ -132,7 +129,8 @@ class Directories { class DBImpl : public DB { public: DBImpl(const DBOptions& options, const std::string& dbname, - const bool seq_per_batch = false, const bool batch_per_txn = true); + const bool seq_per_batch = false, const bool batch_per_txn = true, + bool read_only = false); // No copying allowed DBImpl(const DBImpl&) = delete; void operator=(const DBImpl&) = delete; @@ -437,7 +435,8 @@ class DBImpl : public DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) override; - Status VerifyFileChecksums(const ReadOptions& read_options); + using DB::VerifyFileChecksums; + Status VerifyFileChecksums(const ReadOptions& read_options) override; using DB::VerifyChecksum; virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override; @@ -455,9 +454,10 @@ class DBImpl : public DB { Status VerifyChecksumInternal(const ReadOptions& read_options, bool use_file_checksum); - Status VerifySstFileChecksum(const FileMetaData& fmeta, - const std::string& fpath, - const ReadOptions& read_options); + Status VerifyFullFileChecksum(const std::string& file_checksum_expected, + const std::string& func_name_expected, + const std::string& fpath, + const ReadOptions& read_options); using DB::StartTrace; virtual Status StartTrace( @@ -476,7 +476,7 @@ class DBImpl : public DB { Status EndBlockCacheTrace() override; using 
DB::StartIOTrace; - Status StartIOTrace(Env* env, const TraceOptions& options, + Status StartIOTrace(const TraceOptions& options, std::unique_ptr&& trace_writer) override; using DB::EndIOTrace; @@ -493,6 +493,7 @@ class DBImpl : public DB { #endif // ROCKSDB_LITE // ---- End of implementations of the DB interface ---- + SystemClock* GetSystemClock() const; struct GetImplOptions { ColumnFamilyHandle* column_family = nullptr; @@ -527,7 +528,7 @@ class DBImpl : public DB { ColumnFamilyData* cfd, SequenceNumber snapshot, ReadCallback* read_callback, - bool allow_blob = false, + bool expose_blob_index = false, bool allow_refresh = true); virtual SequenceNumber GetLastPublishedSequence() const { @@ -600,8 +601,11 @@ class DBImpl : public DB { bool* found_record_for_key, bool* is_blob_index = nullptr); - Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key); - Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); + Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, const Slice upper_bound); + Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, + const Slice& lower_bound, + const Slice upper_bound); #endif // ROCKSDB_LITE // Similar to GetSnapshot(), but also lets the db know that this snapshot @@ -948,7 +952,7 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family = nullptr, bool disallow_trivial_move = false); - void TEST_SwitchWAL(); + Status TEST_SwitchWAL(); bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } @@ -980,6 +984,9 @@ class DBImpl : public DB { // is only for the special test of CancelledCompactions Status TEST_WaitForCompact(bool waitUnscheduled = false); + // Get the background error status + Status TEST_GetBGError(); + // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. int64_t TEST_MaxNextLevelOverlappingBytes( @@ -1043,6 +1050,12 @@ class DBImpl : public DB { VersionSet* TEST_GetVersionSet() const { return versions_.get(); } + uint64_t TEST_GetCurrentLogNumber() const { + InstrumentedMutexLock l(mutex()); + assert(!logs_.empty()); + return logs_.back().number; + } + const std::unordered_set& TEST_GetFilesGrabbedForPurge() const { return files_grabbed_for_purge_; } @@ -1062,6 +1075,56 @@ class DBImpl : public DB { // flush LOG out of application buffer void FlushInfoLog(); + // Interface to block and signal the DB in case of stalling writes by + // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface. + // When DB needs to be blocked or signalled by WriteBufferManager, + // state_ is changed accordingly. + class WBMStallInterface : public StallInterface { + public: + enum State { + BLOCKED = 0, + RUNNING, + }; + + WBMStallInterface() : state_cv_(&state_mutex_) { + MutexLock lock(&state_mutex_); + state_ = State::RUNNING; + } + + void SetState(State state) { + MutexLock lock(&state_mutex_); + state_ = state; + } + + // Change the state_ to State::BLOCKED and wait until its state is + // changed by WriteBufferManager. When stall is cleared, Signal() is + // called to change the state and unblock the DB. + void Block() override { + MutexLock lock(&state_mutex_); + while (state_ == State::BLOCKED) { + TEST_SYNC_POINT("WBMStallInterface::BlockDB"); + state_cv_.Wait(); + } + } + + // Called from WriteBufferManager. This function changes the state_ + // to State::RUNNING indicating the stall is cleared and DB can proceed. 
+ void Signal() override { + MutexLock lock(&state_mutex_); + state_ = State::RUNNING; + state_cv_.Signal(); + } + + private: + // Conditional variable and mutex to block and + // signal the DB during stalling process. + port::Mutex state_mutex_; + port::CondVar state_cv_; + // state represting whether DB is running or blocked because of stall by + // WriteBufferManager. + State state_; + }; + protected: const std::string dbname_; std::string db_id_; @@ -1094,6 +1157,14 @@ class DBImpl : public DB { ColumnFamilyHandleImpl* default_cf_handle_; InternalStats* default_cf_internal_stats_; + // table_cache_ provides its own synchronization + std::shared_ptr table_cache_; + + ErrorHandler error_handler_; + + // Unified interface for logging events + EventLogger event_logger_; + // only used for dynamically adjusting max_total_wal_size. it is a sum of // [write_buffer_size * max_write_buffer_number] over all column families uint64_t max_total_in_memory_state_; @@ -1124,12 +1195,27 @@ class DBImpl : public DB { // Default: true const bool batch_per_txn_; + // Each flush or compaction gets its own job id. this counter makes sure + // they're unique + std::atomic next_job_id_; + + std::atomic shutting_down_; + // Except in DB::Open(), WriteOptionsFile can only be called when: // Persist options to options file. // If need_mutex_lock = false, the method will lock DB mutex. // If need_enter_write_thread = false, the method will enter write thread. Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread); + Status CompactRangeInternal(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end); + + Status GetApproximateSizesInternal(const SizeApproximationOptions& options, + ColumnFamilyHandle* column_family, + const Range* range, int n, + uint64_t* sizes); + // The following two functions can only be called when: // 1. WriteThread::Writer::EnterUnbatched() is used. // 2. db_mutex is NOT held @@ -1241,14 +1327,22 @@ class DBImpl : public DB { virtual bool OwnTablesAndLogs() const { return true; } + // Set DB identity file, and write DB ID to manifest if necessary. + Status SetDBId(bool read_only); + // REQUIRES: db mutex held when calling this function, but the db mutex can // be released and re-acquired. Db mutex will be held when the function // returns. - // After best-efforts recovery, there may be SST files in db/cf paths that are - // not referenced in the MANIFEST. We delete these SST files. In the + // After recovery, there may be SST files in db/cf paths that are + // not referenced in the MANIFEST (e.g. + // 1. It's best effort recovery; + // 2. The VersionEdits referencing the SST files are appended to + // MANIFEST, DB crashes when syncing the MANIFEST, the VersionEdits are + // still not synced to MANIFEST during recovery.) + // We delete these SST files. In the // meantime, we find out the largest file number present in the paths, and // bump up the version set's next_file_number_ to be 1 + largest_file_number. 
- Status FinishBestEffortsRecovery(); + Status DeleteUnreferencedSstFiles(); // SetDbSessionId() should be called in the constuctor DBImpl() // to ensure that db_session_id_ gets updated every time the DB is opened @@ -1306,6 +1400,7 @@ class DBImpl : public DB { struct LogFileNumberSize { explicit LogFileNumberSize(uint64_t _number) : number(_number) {} + LogFileNumberSize() {} void AddSize(uint64_t new_size) { size += new_size; } uint64_t number; uint64_t size = 0; @@ -1386,15 +1481,16 @@ class DBImpl : public DB { uint32_t output_path_id; Status status; bool done; - bool in_progress; // compaction request being processed? - bool incomplete; // only part of requested range compacted - bool exclusive; // current behavior of only one manual - bool disallow_trivial_move; // Force actual compaction to run - const InternalKey* begin; // nullptr means beginning of key range - const InternalKey* end; // nullptr means end of key range - InternalKey* manual_end; // how far we are compacting - InternalKey tmp_storage; // Used to keep track of compaction progress - InternalKey tmp_storage1; // Used to keep track of compaction progress + bool in_progress; // compaction request being processed? + bool incomplete; // only part of requested range compacted + bool exclusive; // current behavior of only one manual + bool disallow_trivial_move; // Force actual compaction to run + const InternalKey* begin; // nullptr means beginning of key range + const InternalKey* end; // nullptr means end of key range + InternalKey* manual_end; // how far we are compacting + InternalKey tmp_storage; // Used to keep track of compaction progress + InternalKey tmp_storage1; // Used to keep track of compaction progress + std::atomic* canceled; // Compaction canceled by the user? }; struct PrepickedCompaction { // background compaction takes ownership of `compaction`. @@ -1411,6 +1507,7 @@ class DBImpl : public DB { DBImpl* db; // background compaction takes ownership of `prepicked_compaction`. PrepickedCompaction* prepicked_compaction; + Env::Priority compaction_pri_; }; // Initialize the built-in column family for persistent stats. Depending on @@ -1505,6 +1602,12 @@ class DBImpl : public DB { Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); + // Get the size of a log file and, if truncate is true, truncate the + // log file to its actual size, thereby freeing preallocated space. + // Return success even if truncate fails + Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, + LogFileNumberSize* log); + // Restore alive_log_files_ and total_log_size_ after recovery. // It needs to run only when there's no flush during recovery // (e.g. avoid_flush_during_recovery=true). May also trigger flush @@ -1515,6 +1618,10 @@ class DBImpl : public DB { // `num_bytes` going through. Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options); + // Begin stalling of writes when memory usage increases beyond a certain + // threshold. + void WriteBufferManagerStallWrites(); + Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, WriteBatch* my_batch); @@ -1594,7 +1701,7 @@ class DBImpl : public DB { Status SwitchWAL(WriteContext* write_context); // REQUIRES: mutex locked and in write thread. 
- Status HandleWriteBufferFull(WriteContext* write_context); + Status HandleWriteBufferManagerFlush(WriteContext* write_context); // REQUIRES: mutex locked Status PreprocessWrite(const WriteOptions& write_options, bool* need_log_sync, @@ -1728,7 +1835,9 @@ class DBImpl : public DB { std::unique_ptr* token, LogBuffer* log_buffer); // helper function to call after some of the logs_ were synced - void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status); + Status MarkLogsSynced(uint64_t up_to, bool synced_dir); + // WALs with log number up to up_to are not synced successfully. + void MarkLogsNotSynced(uint64_t up_to); SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary, bool lock = true); @@ -1867,13 +1976,11 @@ class DBImpl : public DB { Status MultiGetImpl( const ReadOptions& read_options, size_t start_key, size_t num_keys, autovector* sorted_keys, - SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback, - bool* is_blob_index); + SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback); Status DisableFileDeletionsWithLock(); - // table_cache_ provides its own synchronization - std::shared_ptr table_cache_; + Status IncreaseFullHistoryTsLow(ColumnFamilyData* cfd, std::string ts_low); // Lock over the persistent DB state. Non-nullptr iff successfully acquired. FileLock* db_lock_; @@ -1888,8 +1995,6 @@ class DBImpl : public DB { // mutex_, the order should be first mutex_ and then log_write_mutex_. InstrumentedMutex log_write_mutex_; - std::atomic shutting_down_; - // If zero, manual compactions are allowed to proceed. If non-zero, manual // compactions may still be running, but will quickly fail with // `Status::Incomplete`. The value indicates how many threads have paused @@ -2098,10 +2203,6 @@ class DBImpl : public DB { // Number of threads intending to write to memtable std::atomic pending_memtable_writes_ = {}; - // Each flush or compaction gets its own job id. this counter makes sure - // they're unique - std::atomic next_job_id_; - // A flag indicating whether the current rocksdb database has any // data that is not yet persisted into either WAL or SST file. // Used when disableWAL is true. @@ -2130,9 +2231,6 @@ class DBImpl : public DB { WalManager wal_manager_; #endif // ROCKSDB_LITE - // Unified interface for logging events - EventLogger event_logger_; - // A value of > 0 temporarily disables scheduling of background work int bg_work_paused_; @@ -2200,8 +2298,6 @@ class DBImpl : public DB { // Flag to check whether Close() has been called on this DB bool closed_; - ErrorHandler error_handler_; - // Conditional variable to coordinate installation of atomic flush results. // With atomic flush, each bg thread installs the result of flushing multiple // column families, and different threads can flush different column @@ -2226,11 +2322,18 @@ class DBImpl : public DB { LogBuffer* log_buffer, PluggableCompactionResult* result); bool wal_in_db_path_; + + BlobFileCompletionCallback blob_callback_; + + // Pointer to WriteBufferManager stalling interface. 
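// The WBMStallInterface introduced above parks writer threads while the
// WriteBufferManager reports memory pressure and wakes them again via
// Signal(); the wbm_stall_ member declared just below holds RocksDB's
// implementation. The following standalone sketch shows only the generic
// block/signal pattern with the standard library -- class and member names
// here are illustrative, not RocksDB's.

#include <condition_variable>
#include <mutex>

// Hypothetical stand-in for the stall gate: writers call Block() and sleep
// until the manager calls Signal() once memory usage drops again.
class StallGate {
 public:
  void Block() {
    std::unique_lock<std::mutex> lock(mu_);
    state_ = State::kBlocked;
    cv_.wait(lock, [this] { return state_ == State::kRunning; });
  }

  void Signal() {
    {
      std::lock_guard<std::mutex> lock(mu_);
      state_ = State::kRunning;
    }
    cv_.notify_all();
  }

 private:
  enum class State { kRunning, kBlocked };
  std::mutex mu_;
  std::condition_variable cv_;
  State state_ = State::kRunning;
};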
+ std::unique_ptr wbm_stall_; }; -extern Options SanitizeOptions(const std::string& db, const Options& src); +extern Options SanitizeOptions(const std::string& db, const Options& src, + bool read_only = false); -extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src); +extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src, + bool read_only = false); extern CompressionType GetCompressionFlush( const ImmutableCFOptions& ioptions, @@ -2242,11 +2345,27 @@ extern CompressionType GetCompressionFlush( // `memtables_to_flush`) will be flushed and thus will not depend on any WAL // file. // The function is only applicable to 2pc mode. -extern uint64_t PrecomputeMinLogNumberToKeep( +extern uint64_t PrecomputeMinLogNumberToKeep2PC( VersionSet* vset, const ColumnFamilyData& cfd_to_flush, - autovector edit_list, + const autovector& edit_list, const autovector& memtables_to_flush, LogsWithPrepTracker* prep_tracker); +// For atomic flush. +extern uint64_t PrecomputeMinLogNumberToKeep2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists, + const autovector*>& memtables_to_flush, + LogsWithPrepTracker* prep_tracker); + +// In non-2PC mode, WALs with log number < the returned number can be +// deleted after the cfd_to_flush column family is flushed successfully. +extern uint64_t PrecomputeMinLogNumberToKeepNon2PC( + VersionSet* vset, const ColumnFamilyData& cfd_to_flush, + const autovector& edit_list); +// For atomic flush. +extern uint64_t PrecomputeMinLogNumberToKeepNon2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists); // `cfd_to_flush` is the column family whose memtable will be flushed and thus // will not depend on any WAL file. nullptr means no memtable is being flushed. @@ -2254,6 +2373,10 @@ extern uint64_t PrecomputeMinLogNumberToKeep( extern uint64_t FindMinPrepLogReferencedByMemTable( VersionSet* vset, const ColumnFamilyData* cfd_to_flush, const autovector& memtables_to_flush); +// For atomic flush. +extern uint64_t FindMinPrepLogReferencedByMemTable( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector*>& memtables_to_flush); // Fix user-supplied options to be reasonable template diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index cce44f8626f..75571e96e77 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -35,8 +35,10 @@ bool DBImpl::EnoughRoomForCompaction( // Pass the current bg_error_ to SFM so it can decide what checks to // perform. If this DB instance hasn't seen any error yet, the SFM can be // optimistic and not do disk space checks - enough_room = - sfm->EnoughRoomForCompaction(cfd, inputs, error_handler_.GetBGError()); + Status bg_error = error_handler_.GetBGError(); + enough_room = sfm->EnoughRoomForCompaction(cfd, inputs, bg_error); + bg_error.PermitUncheckedError(); // bg_error is just a copy of the Status + // from the error_handler_ if (enough_room) { *sfm_reserved_compact_space = true; } @@ -123,20 +125,17 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context) { // "number <= current_log_number - 1" is equivalent to // "number < current_log_number". 
- MarkLogsSynced(current_log_number - 1, true, io_s); + if (io_s.ok()) { + io_s = status_to_io_status(MarkLogsSynced(current_log_number - 1, true)); + } else { + MarkLogsNotSynced(current_log_number - 1); + } if (!io_s.ok()) { - if (total_log_size_ > 0) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); - } else { - // If the WAL is empty, we use different error reason - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL) - .PermitUncheckedError(); - } TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed"); return io_s; } } + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:end"); return io_s; } @@ -155,27 +154,25 @@ Status DBImpl::FlushMemTableToOutputFile( FlushJob flush_job( dbname_, cfd, immutable_db_options_, mutable_cf_options, - nullptr /* memtable_id */, file_options_for_compaction_, versions_.get(), - &mutex_, &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, job_context, log_buffer, directories_.GetDbDir(), - GetDataDir(cfd, 0U), + port::kMaxUint64 /* memtable_id */, file_options_for_compaction_, + versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, + earliest_write_conflict_snapshot, snapshot_checker, job_context, + log_buffer, directories_.GetDbDir(), GetDataDir(cfd, 0U), GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, true /* sync_output_directory */, true /* write_manifest */, thread_pri, - io_tracer_, db_id_, db_session_id_); + io_tracer_, db_id_, db_session_id_, cfd->GetFullHistoryTsLow(), + &blob_callback_); FileMetaData file_meta; - TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"); - flush_job.PickMemTable(); - TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:AfterPickMemtables"); - #ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id); #endif // ROCKSDB_LITE Status s; - IOStatus io_s = IOStatus::OK(); + bool need_cancel = false; + IOStatus log_io_s = IOStatus::OK(); if (logfile_number_ > 0 && versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1) { // If there are more than one column families, we need to make sure that @@ -184,16 +181,24 @@ Status DBImpl::FlushMemTableToOutputFile( // flushed SST may contain data from write batches whose updates to // other column families are missing. // SyncClosedLogs() may unlock and re-lock the db_mutex. - io_s = SyncClosedLogs(job_context); - s = io_s; - if (!io_s.ok() && !io_s.IsShutdownInProgress() && - !io_s.IsColumnFamilyDropped()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + log_io_s = SyncClosedLogs(job_context); + if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && + !log_io_s.IsColumnFamilyDropped()) { + error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush); } } else { TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip"); } + s = log_io_s; + + // If the log sync failed, we do not need to pick memtable. Otherwise, + // num_flush_not_started_ needs to be rollback. + TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"); + if (s.ok()) { + flush_job.PickMemTable(); + need_cancel = true; + } + TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:AfterPickMemtables"); // Within flush_job.Run, rocksdb may call event listener to notify // file creation and deletion. @@ -203,11 +208,16 @@ Status DBImpl::FlushMemTableToOutputFile( // is unlocked by the current thread. 
if (s.ok()) { s = flush_job.Run(&logs_with_prep_tracker_, &file_meta); - } else { + need_cancel = false; + } + + if (!s.ok() && need_cancel) { flush_job.Cancel(); } - if (io_s.ok()) { - io_s = flush_job.io_status(); + IOStatus io_s = IOStatus::OK(); + io_s = flush_job.io_status(); + if (s.ok()) { + s = io_s; } if (s.ok()) { @@ -243,30 +253,30 @@ Status DBImpl::FlushMemTableToOutputFile( if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { if (!io_s.ok() && !io_s.IsShutdownInProgress() && !io_s.IsColumnFamilyDropped()) { + assert(log_io_s.ok()); // Error while writing to MANIFEST. // In fact, versions_->io_status() can also be the result of renaming // CURRENT file. With current code, it's just difficult to tell. So just // be pessimistic and try write to a new MANIFEST. // TODO: distinguish between MANIFEST write and CURRENT renaming if (!versions_->io_status().ok()) { - // Should handle return error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite) - .PermitUncheckedError(); - } else if (total_log_size_ > 0) { - // Should handle return error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the Manifest write will be map to soft error. + // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor is + // needed. + error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWriteNoWAL); } else { - // If the WAL is empty, we use different error reason - // Should handle return error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL) - .PermitUncheckedError(); + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the other SST file write errors will be set as + // kFlushNoWAL. + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL); } } else { - Status new_bg_error = s; - // Should handle return error? - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + if (log_io_s.ok()) { + Status new_bg_error = s; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + } } } else { // If we got here, then we decided not to care about the i_os status (either @@ -284,16 +294,17 @@ Status DBImpl::FlushMemTableToOutputFile( // Notify sst_file_manager that a new file was added std::string file_path = MakeTableFileName( cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber()); - sfm->OnAddFile(file_path); + // TODO (PR7798). We should only add the file to the FileManager if it + // exists. Otherwise, some tests may fail. Ignore the error in the + // interim. + sfm->OnAddFile(file_path).PermitUncheckedError(); if (sfm->IsMaxAllowedSpaceReached()) { Status new_bg_error = Status::SpaceLimit("Max allowed space was reached"); TEST_SYNC_POINT_CALLBACK( "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached", &new_bg_error); - // Should handle this error? 
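// The error-handling branch in this function (and its atomic-flush twin later
// in the file) routes a failed flush to a background-error reason depending on
// whether the WAL sync already failed and whether the MANIFEST write failed.
// Below is a hedged, simplified summary of that decision as a standalone
// function; the enum and names are invented for illustration and are not
// RocksDB's API.

#include <cassert>

enum class BgErrorReason { kFlush, kFlushNoWAL, kManifestWriteNoWAL };

BgErrorReason ClassifyFlushFailure(bool wal_sync_ok, bool flush_io_ok,
                                   bool manifest_io_ok) {
  // WAL sync failures are reported earlier, when SyncClosedLogs() fails.
  assert(wal_sync_ok);
  if (!flush_io_ok) {
    // The WAL is intact, so a MANIFEST/CURRENT failure or an SST write
    // failure can be treated as a softer, WAL-independent error.
    return manifest_io_ok ? BgErrorReason::kFlushNoWAL
                          : BgErrorReason::kManifestWriteNoWAL;
  }
  // Non-IO failure of the flush job itself.
  return BgErrorReason::kFlush;
}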
- error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } } #endif // ROCKSDB_LITE @@ -309,30 +320,22 @@ Status DBImpl::FlushMemTablesToOutputFiles( return AtomicFlushMemTablesToOutputFiles( bg_flush_args, made_progress, job_context, log_buffer, thread_pri); } + assert(bg_flush_args.size() == 1); std::vector snapshot_seqs; SequenceNumber earliest_write_conflict_snapshot; SnapshotChecker* snapshot_checker; GetSnapshotContext(job_context, &snapshot_seqs, &earliest_write_conflict_snapshot, &snapshot_checker); - Status status; - for (auto& arg : bg_flush_args) { - ColumnFamilyData* cfd = arg.cfd_; - MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); - SuperVersionContext* superversion_context = arg.superversion_context_; - Status s = FlushMemTableToOutputFile( - cfd, mutable_cf_options, made_progress, job_context, - superversion_context, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, log_buffer, thread_pri); - if (!s.ok()) { - status = s; - if (!s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { - // At this point, DB is not shutting down, nor is cfd dropped. - // Something is wrong, thus we break out of the loop. - break; - } - } - } - return status; + const auto& bg_flush_arg = bg_flush_args[0]; + ColumnFamilyData* cfd = bg_flush_arg.cfd_; + MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + SuperVersionContext* superversion_context = + bg_flush_arg.superversion_context_; + Status s = FlushMemTableToOutputFile( + cfd, mutable_cf_options, made_progress, job_context, superversion_context, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + log_buffer, thread_pri); + return s; } /* @@ -395,7 +398,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions()); const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back(); - const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_); + uint64_t max_memtable_id = bg_flush_args[i].max_memtable_id_; jobs.emplace_back(new FlushJob( dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_, @@ -404,13 +407,13 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, false /* sync_output_directory */, false /* write_manifest */, - thread_pri, io_tracer_, db_id_, db_session_id_)); - jobs.back()->PickMemTable(); + thread_pri, io_tracer_, db_id_, db_session_id_, + cfd->GetFullHistoryTsLow())); } std::vector file_meta(num_cfs); Status s; - IOStatus io_s; + IOStatus log_io_s = IOStatus::OK(); assert(num_cfs == static_cast(jobs.size())); #ifndef ROCKSDB_LITE @@ -425,18 +428,36 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( if (logfile_number_ > 0) { // TODO (yanqin) investigate whether we should sync the closed logs for // single column family case. 
- io_s = SyncClosedLogs(job_context); - s = io_s; + log_io_s = SyncClosedLogs(job_context); + if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && + !log_io_s.IsColumnFamilyDropped()) { + if (total_log_size_ > 0) { + error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush); + } else { + // If the WAL is empty, we use different error reason + error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlushNoWAL); + } + } } + s = log_io_s; // exec_status stores the execution status of flush_jobs as // autovector> exec_status; autovector io_status; + std::vector pick_status; for (int i = 0; i != num_cfs; ++i) { // Initially all jobs are not executed, with status OK. exec_status.emplace_back(false, Status::OK()); io_status.emplace_back(IOStatus::OK()); + pick_status.push_back(false); + } + + if (s.ok()) { + for (int i = 0; i != num_cfs; ++i) { + jobs[i]->PickMemTable(); + pick_status[i] = true; + } } if (s.ok()) { @@ -477,6 +498,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( s = error_status.ok() ? s : error_status; } + IOStatus io_s = IOStatus::OK(); if (io_s.ok()) { IOStatus io_error = IOStatus::OK(); for (int i = 0; i != static_cast(io_status.size()); i++) { @@ -512,12 +534,12 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // Have to cancel the flush jobs that have NOT executed because we need to // unref the versions. for (int i = 0; i != num_cfs; ++i) { - if (!exec_status[i].first) { + if (pick_status[i] && !exec_status[i].first) { jobs[i]->Cancel(); } } for (int i = 0; i != num_cfs; ++i) { - if (exec_status[i].first && exec_status[i].second.ok()) { + if (exec_status[i].second.ok() && exec_status[i].first) { auto& mems = jobs[i]->GetMemTables(); cfds[i]->imm()->RollbackMemtableFlush(mems, file_meta[i].fd.GetNumber()); @@ -581,7 +603,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( s = InstallMemtableAtomicFlushResults( nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list, - versions_.get(), &mutex_, tmp_file_meta, + versions_.get(), &logs_with_prep_tracker_, &mutex_, tmp_file_meta, &job_context->memtables_to_free, directories_.GetDbDir(), log_buffer); } @@ -627,7 +649,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); assert(all_mutable_cf_options.size() == static_cast(num_cfs)); - for (int i = 0; i != num_cfs; ++i) { + for (int i = 0; s.ok() && i != num_cfs; ++i) { if (cfds[i]->IsDropped()) { continue; } @@ -636,14 +658,16 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( if (sfm) { std::string file_path = MakeTableFileName( cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber()); - sfm->OnAddFile(file_path); + // TODO (PR7798). We should only add the file to the FileManager if it + // exists. Otherwise, some tests may fail. Ignore the error in the + // interim. + sfm->OnAddFile(file_path).PermitUncheckedError(); if (sfm->IsMaxAllowedSpaceReached() && error_handler_.GetBGError().ok()) { Status new_bg_error = Status::SpaceLimit("Max allowed space was reached"); - // Should Handle this error? - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + error_handler_.SetBGError(new_bg_error, + BackgroundErrorReason::kFlush); } } } @@ -654,30 +678,30 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // it is not because of CF drop. if (!s.ok() && !s.IsColumnFamilyDropped()) { if (!io_s.ok() && !io_s.IsColumnFamilyDropped()) { + assert(log_io_s.ok()); // Error while writing to MANIFEST. 
// In fact, versions_->io_status() can also be the result of renaming // CURRENT file. With current code, it's just difficult to tell. So just // be pessimistic and try write to a new MANIFEST. // TODO: distinguish between MANIFEST write and CURRENT renaming if (!versions_->io_status().ok()) { - // Should Handle this error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite) - .PermitUncheckedError(); - } else if (total_log_size_ > 0) { - // Should Handle this error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the Manifest write will be map to soft error. + // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor + // is needed. + error_handler_.SetBGError(io_s, + BackgroundErrorReason::kManifestWriteNoWAL); } else { - // If the WAL is empty, we use different error reason - // Should Handle this error? - error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL) - .PermitUncheckedError(); + // If WAL sync is successful (either WAL size is 0 or there is no IO + // error), all the other SST file write errors will be set as + // kFlushNoWAL. + error_handler_.SetBGError(io_s, BackgroundErrorReason::kFlushNoWAL); } } else { - Status new_bg_error = s; - // Should Handle this error? - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush) - .PermitUncheckedError(); + if (log_io_s.ok()) { + Status new_bg_error = s; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + } } } @@ -777,7 +801,68 @@ void DBImpl::NotifyOnFlushCompleted( Status DBImpl::CompactRange(const CompactRangeOptions& options, ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) { + const Slice* begin_without_ts, + const Slice* end_without_ts) { + if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + + if (options.canceled && options.canceled->load(std::memory_order_acquire)) { + return Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } + + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (ts_sz == 0) { + return CompactRangeInternal(options, column_family, begin_without_ts, + end_without_ts); + } + + std::string begin_str; + std::string end_str; + + // CompactRange compact all keys: [begin, end] inclusively. Add maximum + // timestamp to include all `begin` keys, and add minimal timestamp to include + // all `end` keys. + if (begin_without_ts != nullptr) { + AppendKeyWithMaxTimestamp(&begin_str, *begin_without_ts, ts_sz); + } + if (end_without_ts != nullptr) { + AppendKeyWithMinTimestamp(&end_str, *end_without_ts, ts_sz); + } + Slice begin(begin_str); + Slice end(end_str); + + Slice* begin_with_ts = begin_without_ts ? &begin : nullptr; + Slice* end_with_ts = end_without_ts ? 
&end : nullptr; + + return CompactRangeInternal(options, column_family, begin_with_ts, + end_with_ts); +} + +Status DBImpl::IncreaseFullHistoryTsLow(ColumnFamilyData* cfd, + std::string ts_low) { + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + edit.SetFullHistoryTsLow(ts_low); + + InstrumentedMutexLock l(&mutex_); + std::string current_ts_low = cfd->GetFullHistoryTsLow(); + const Comparator* ucmp = cfd->user_comparator(); + if (!current_ts_low.empty() && + ucmp->CompareTimestamp(ts_low, current_ts_low) < 0) { + return Status::InvalidArgument( + "Cannot decrease full_history_timestamp_low"); + } + + return versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, + &mutex_); +} + +Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, + ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end) { auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); @@ -786,18 +871,36 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, } bool flush_needed = true; + + // Update full_history_ts_low if it's set + if (options.full_history_ts_low != nullptr && + !options.full_history_ts_low->empty()) { + std::string ts_low = options.full_history_ts_low->ToString(); + if (begin != nullptr || end != nullptr) { + return Status::InvalidArgument( + "Cannot specify compaction range with full_history_ts_low"); + } + Status s = IncreaseFullHistoryTsLow(cfd, ts_low); + if (!s.ok()) { + LogFlush(immutable_db_options_.info_log); + return s; + } + } + + Status s; if (begin != nullptr && end != nullptr) { // TODO(ajkr): We could also optimize away the flush in certain cases where // one/both sides of the interval are unbounded. But it requires more // changes to RangesOverlapWithMemtables. Range range(*begin, *end); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); - cfd->RangesOverlapWithMemtables({range}, super_version, &flush_needed); + s = cfd->RangesOverlapWithMemtables( + {range}, super_version, immutable_db_options_.allow_data_in_errors, + &flush_needed); CleanupSuperVersion(super_version); } - Status s; - if (flush_needed) { + if (s.ok() && flush_needed) { FlushOptions fo; fo.allow_write_stall = options.allow_write_stall; if (immutable_db_options_.atomic_flush) { @@ -993,7 +1096,7 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options, assert(cfd); Status s; - JobContext job_context(0, true); + JobContext job_context(next_job_id_.fetch_add(1), true); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); @@ -1144,16 +1247,18 @@ Status DBImpl::CompactFilesImpl( assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJobStats compaction_job_stats; CompactionJob compaction_job( - job_context->job_id, c.get(), immutable_db_options_, + job_context->job_id, c.get(), immutable_db_options_, mutable_db_options_, file_options_for_compaction_, versions_.get(), &shutting_down_, preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), - GetDataDir(c->column_family_data(), c->output_path_id()), stats_, &mutex_, - &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot, - snapshot_checker, table_cache_, &event_logger_, + GetDataDir(c->column_family_data(), c->output_path_id()), + GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, 
dbname_, &compaction_job_stats, Env::Priority::USER, io_tracer_, - &manual_compaction_paused_, db_id_, db_session_id_); + &manual_compaction_paused_, nullptr, db_id_, db_session_id_, + c->column_family_data()->GetFullHistoryTsLow()); // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already @@ -1167,7 +1272,8 @@ Status DBImpl::CompactFilesImpl( mutex_.Unlock(); TEST_SYNC_POINT("CompactFilesImpl:0"); TEST_SYNC_POINT("CompactFilesImpl:1"); - compaction_job.Run(); + // Ignore the status here, as it will be checked in the Install down below... + compaction_job.Run().PermitUncheckedError(); TEST_SYNC_POINT("CompactFilesImpl:2"); TEST_SYNC_POINT("CompactFilesImpl:3"); mutex_.Lock(); @@ -1217,18 +1323,16 @@ Status DBImpl::CompactFilesImpl( job_context->job_id, status.ToString().c_str()); IOStatus io_s = compaction_job.io_status(); if (!io_s.ok()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction); } else { - error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction) - .PermitUncheckedError(); + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); } } if (output_file_names != nullptr) { for (const auto& newf : c->edit()->GetNewFiles()) { (*output_file_names) - .push_back(TableFileName(c->immutable_cf_options()->cf_paths, + .push_back(TableFileName(c->immutable_options()->cf_paths, newf.second.fd.GetNumber(), newf.second.fd.GetPathId())); } @@ -1326,10 +1430,13 @@ void DBImpl::NotifyOnCompactionCompleted( if (shutting_down_.load(std::memory_order_acquire)) { return; } + // TODO: Should disabling manual compaction squash compaction completed + // notifications that aren't the result of a shutdown? 
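// Manual compactions in this patch can now be interrupted two ways: by
// DisableManualCompaction() (the manual_compaction_paused_ counter) or by the
// new per-request `canceled` atomic, both polled with memory_order_acquire.
// A minimal standalone sketch of that cooperative-cancellation pattern is
// below; the types and names are hypothetical.

#include <atomic>

// A long-running job periodically polls a shared flag that another thread can
// flip; acquire/release ordering makes the request visible to the worker.
struct CancelToken {
  std::atomic<bool> canceled{false};
};

// Returns true if the work ran to completion, false if it was canceled early
// (analogous to returning Status::Incomplete(kManualCompactionPaused)).
bool RunCancelableWork(const CancelToken& token, int steps) {
  for (int i = 0; i < steps; ++i) {
    if (token.canceled.load(std::memory_order_acquire)) {
      return false;
    }
    // ... do one unit of work ...
  }
  return true;
}

// Elsewhere, a controlling thread requests cancellation:
//   token.canceled.store(true, std::memory_order_release);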
if (c->is_manual_compaction() && manual_compaction_paused_.load(std::memory_order_acquire) > 0) { return; } + Version* current = cfd->current(); current->Ref(); // release lock while notifying events @@ -1366,8 +1473,6 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { SuperVersionContext sv_context(/* create_superversion */ true); - Status status; - InstrumentedMutexLock guard_lock(&mutex_); // only allow one thread refitting @@ -1431,8 +1536,9 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, - directories_.GetDbDir()); + Status status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, + &mutex_, directories_.GetDbDir()); + InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n", @@ -1443,12 +1549,14 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { "[%s] After refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); } + sv_context.Clean(); + refitting_level_ = false; + + return status; } - sv_context.Clean(); refitting_level_ = false; - - return status; + return Status::OK(); } int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { @@ -1553,6 +1661,7 @@ Status DBImpl::RunManualCompaction( manual.incomplete = false; manual.exclusive = exclusive; manual.disallow_trivial_move = disallow_trivial_move; + manual.canceled = compact_range_options.canceled; // For universal compaction, we enforce every manual compaction to compact // all files. if (begin == nullptr || @@ -1644,6 +1753,7 @@ Status DBImpl::RunManualCompaction( } ca = new CompactionArg; ca->db = this; + ca->compaction_pri_ = Env::Priority::LOW; ca->prepicked_compaction = new PrepickedCompaction; ca->prepicked_compaction->manual_compaction_state = &manual; ca->prepicked_compaction->compaction = compaction; @@ -1690,8 +1800,9 @@ void DBImpl::GenerateFlushRequest(const autovector& cfds, Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& flush_options, FlushReason flush_reason, bool writes_stopped) { + // This method should not be called if atomic_flush is true. + assert(!immutable_db_options_.atomic_flush); Status s; - uint64_t flush_memtable_id = 0; if (!flush_options.allow_write_stall) { bool flush_needed = true; s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed); @@ -1701,7 +1812,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } } - FlushRequest flush_req; + autovector flush_reqs; + autovector memtable_ids_to_wait; { WriteContext context; InstrumentedMutexLock guard_lock(&mutex_); @@ -1716,18 +1828,24 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } WaitForPendingWrites(); - if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { - if (flush_reason != FlushReason::kErrorRecoveryRetryFlush) { - s = SwitchMemtable(cfd, &context); - } else { - assert(cfd->imm()->NumNotFlushed() > 0); - } + if (flush_reason != FlushReason::kErrorRecoveryRetryFlush && + (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load())) { + // Note that, when flush reason is kErrorRecoveryRetryFlush, during the + // auto retry resume, we want to avoid creating new small memtables. + // Therefore, SwitchMemtable will not be called. 
Also, since ResumeImpl + // will iterate through all the CFs and call FlushMemtable during auto + // retry resume, it is possible that in some CFs, + // cfd->imm()->NumNotFlushed() = 0. In this case, so no flush request will + // be created and scheduled, status::OK() will be returned. + s = SwitchMemtable(cfd, &context); } + const uint64_t flush_memtable_id = port::kMaxUint64; if (s.ok()) { if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { - flush_memtable_id = cfd->imm()->GetLatestMemTableID(); - flush_req.emplace_back(cfd, flush_memtable_id); + FlushRequest req{{cfd, flush_memtable_id}}; + flush_reqs.emplace_back(std::move(req)); + memtable_ids_to_wait.emplace_back(cfd->imm()->GetLatestMemTableID()); } if (immutable_db_options_.persist_stats_to_disk && flush_reason != FlushReason::kErrorRecoveryRetryFlush) { @@ -1753,15 +1871,19 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, "to avoid holding old logs", cfd->GetName().c_str()); s = SwitchMemtable(cfd_stats, &context); - flush_memtable_id = cfd_stats->imm()->GetLatestMemTableID(); - flush_req.emplace_back(cfd_stats, flush_memtable_id); + FlushRequest req{{cfd_stats, flush_memtable_id}}; + flush_reqs.emplace_back(std::move(req)); + memtable_ids_to_wait.emplace_back( + cfd->imm()->GetLatestMemTableID()); } } } } - if (s.ok() && !flush_req.empty()) { - for (auto& elem : flush_req) { - ColumnFamilyData* loop_cfd = elem.first; + + if (s.ok() && !flush_reqs.empty()) { + for (const auto& req : flush_reqs) { + assert(req.size() == 1); + ColumnFamilyData* loop_cfd = req[0].first; loop_cfd->imm()->FlushRequested(); } // If the caller wants to wait for this flush to complete, it indicates @@ -1769,12 +1891,15 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, // other threads which may drop the column family concurrently. // Therefore, we increase the cfd's ref count. if (flush_options.wait) { - for (auto& elem : flush_req) { - ColumnFamilyData* loop_cfd = elem.first; + for (const auto& req : flush_reqs) { + assert(req.size() == 1); + ColumnFamilyData* loop_cfd = req[0].first; loop_cfd->Ref(); } } - SchedulePendingFlush(flush_req, flush_reason); + for (const auto& req : flush_reqs) { + SchedulePendingFlush(req, flush_reason); + } MaybeScheduleFlushOrCompaction(); } @@ -1790,9 +1915,11 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, if (s.ok() && flush_options.wait) { autovector cfds; autovector flush_memtable_ids; - for (auto& iter : flush_req) { - cfds.push_back(iter.first); - flush_memtable_ids.push_back(&(iter.second)); + assert(flush_reqs.size() == memtable_ids_to_wait.size()); + for (size_t i = 0; i < flush_reqs.size(); ++i) { + assert(flush_reqs[i].size() == 1); + cfds.push_back(flush_reqs[i][0].first); + flush_memtable_ids.push_back(&(memtable_ids_to_wait[i])); } s = WaitForFlushMemTables( cfds, flush_memtable_ids, @@ -1974,12 +2101,12 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, // check whether one extra immutable memtable or an extra L0 file would // cause write stalling mode to be entered. 
It could still enter stall // mode due to pending compaction bytes, but that's less common - write_stall_condition = - ColumnFamilyData::GetWriteStallConditionAndCause( - cfd->imm()->NumNotFlushed() + 1, - vstorage->l0_delay_trigger_count() + 1, - vstorage->estimated_compaction_needed_bytes(), mutable_cf_options) - .first; + write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause( + cfd->imm()->NumNotFlushed() + 1, + vstorage->l0_delay_trigger_count() + 1, + vstorage->estimated_compaction_needed_bytes(), + mutable_cf_options, *cfd->ioptions()) + .first; } while (write_stall_condition != WriteStallCondition::kNormal); } return Status::OK(); @@ -2158,6 +2285,7 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { unscheduled_compactions_ > 0) { CompactionArg* ca = new CompactionArg; ca->db = this; + ca->compaction_pri_ = Env::Priority::LOW; ca->prepicked_compaction = nullptr; bg_compaction_scheduled_++; unscheduled_compactions_--; @@ -2217,6 +2345,17 @@ DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() { assert(!flush_queue_.empty()); FlushRequest flush_req = flush_queue_.front(); flush_queue_.pop_front(); + if (!immutable_db_options_.atomic_flush) { + assert(flush_req.size() == 1); + } + for (const auto& elem : flush_req) { + if (!immutable_db_options_.atomic_flush) { + ColumnFamilyData* cfd = elem.first; + assert(cfd); + assert(cfd->queued_for_flush()); + cfd->set_queued_for_flush(false); + } + } // TODO: need to unset flush reason? return flush_req; } @@ -2249,19 +2388,36 @@ ColumnFamilyData* DBImpl::PickCompactionFromQueue( void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req, FlushReason flush_reason) { + mutex_.AssertHeld(); if (flush_req.empty()) { return; } - for (auto& iter : flush_req) { - ColumnFamilyData* cfd = iter.first; - cfd->Ref(); - cfd->SetFlushReason(flush_reason); + if (!immutable_db_options_.atomic_flush) { + // For the non-atomic flush case, we never schedule multiple column + // families in the same flush request. 
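// For the non-atomic case described in the comment above, SchedulePendingFlush()
// now enqueues at most one single-CF request and uses the queued_for_flush flag
// to avoid queueing the same column family twice; PopFirstFromFlushQueue()
// clears the flag on dequeue. A standalone sketch of that dedup-queue idea,
// with generic types rather than RocksDB's:

#include <deque>

// Each schedulable item carries a "queued" flag so it is enqueued at most once.
struct FlushCandidate {
  bool queued_for_flush = false;
  bool flush_pending = true;  // stands in for imm()->IsFlushPending()
};

class FlushQueue {
 public:
  // Enqueue only if there is work to do and the item is not already queued.
  void Schedule(FlushCandidate* c) {
    if (!c->queued_for_flush && c->flush_pending) {
      c->queued_for_flush = true;
      queue_.push_back(c);
    }
  }

  // Dequeue and clear the flag so the item can be scheduled again later.
  FlushCandidate* PopFirst() {
    if (queue_.empty()) return nullptr;
    FlushCandidate* c = queue_.front();
    queue_.pop_front();
    c->queued_for_flush = false;
    return c;
  }

 private:
  std::deque<FlushCandidate*> queue_;
};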
+ assert(flush_req.size() == 1); + ColumnFamilyData* cfd = flush_req[0].first; + assert(cfd); + if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) { + cfd->Ref(); + cfd->set_queued_for_flush(true); + cfd->SetFlushReason(flush_reason); + ++unscheduled_flushes_; + flush_queue_.push_back(flush_req); + } + } else { + for (auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + cfd->Ref(); + cfd->SetFlushReason(flush_reason); + } + ++unscheduled_flushes_; + flush_queue_.push_back(flush_req); } - ++unscheduled_flushes_; - flush_queue_.push_back(flush_req); } void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { + mutex_.AssertHeld(); if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) { AddToCompactionQueue(cfd); ++unscheduled_compactions_; @@ -2317,7 +2473,16 @@ void DBImpl::BGWorkPurge(void* db) { } void DBImpl::UnscheduleCompactionCallback(void* arg) { - CompactionArg ca = *(reinterpret_cast(arg)); + CompactionArg* ca_ptr = reinterpret_cast(arg); + Env::Priority compaction_pri = ca_ptr->compaction_pri_; + if (Env::Priority::BOTTOM == compaction_pri) { + // Decrement bg_bottom_compaction_scheduled_ if priority is BOTTOM + ca_ptr->db->bg_bottom_compaction_scheduled_--; + } else if (Env::Priority::LOW == compaction_pri) { + // Decrement bg_compaction_scheduled_ if priority is LOW + ca_ptr->db->bg_compaction_scheduled_--; + } + CompactionArg ca = *(ca_ptr); delete reinterpret_cast(arg); if (ca.prepicked_compaction != nullptr) { if (ca.prepicked_compaction->compaction != nullptr) { @@ -2329,6 +2494,14 @@ void DBImpl::UnscheduleCompactionCallback(void* arg) { } void DBImpl::UnscheduleFlushCallback(void* arg) { + // Decrement bg_flush_scheduled_ in flush callback + reinterpret_cast(arg)->db_->bg_flush_scheduled_--; + Env::Priority flush_pri = reinterpret_cast(arg)->thread_pri_; + if (Env::Priority::LOW == flush_pri) { + TEST_SYNC_POINT("DBImpl::UnscheduleLowFlushCallback"); + } else if (Env::Priority::HIGH == flush_pri) { + TEST_SYNC_POINT("DBImpl::UnscheduleHighFlushCallback"); + } delete reinterpret_cast(arg); TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback"); } @@ -2421,6 +2594,8 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1"); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2"); { InstrumentedMutexLock l(&mutex_); assert(bg_flush_scheduled_); @@ -2449,7 +2624,7 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { s.ToString().c_str(), error_cnt); log_buffer.FlushBufferToLog(); LogFlush(immutable_db_options_.info_log); - env_->SleepForMicroseconds(1000000); + immutable_db_options_.clock->SleepForMicroseconds(1000000); mutex_.Lock(); } @@ -2522,7 +2697,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, if (s.IsBusy()) { bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); - env_->SleepForMicroseconds(10000); // prevent hot loop + immutable_db_options_.clock->SleepForMicroseconds( + 10000); // prevent hot loop mutex_.Lock(); } else if (!s.ok() && !s.IsShutdownInProgress() && !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) { @@ -2540,7 +2716,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); LogFlush(immutable_db_options_.info_log); - env_->SleepForMicroseconds(1000000); + 
immutable_db_options_.clock->SleepForMicroseconds(1000000); mutex_.Lock(); } else if (s.IsManualCompactionPaused()) { ManualCompactionState* m = prepicked_compaction->manual_compaction_state; @@ -2556,7 +2732,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, // failure). Thus, we force full scan in FindObsoleteFiles() FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && !s.IsManualCompactionPaused() && - !s.IsColumnFamilyDropped()); + !s.IsColumnFamilyDropped() && + !s.IsBusy()); TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); // delete unnecessary files if any, this is done outside the mutex @@ -2590,6 +2767,14 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, // See if there's more work to be done MaybeScheduleFlushOrCompaction(); + + if (prepicked_compaction != nullptr && + prepicked_compaction->task_token != nullptr) { + // Releasing task tokens affects the DB state, so must be done before we + // potentially signal the DB close process to proceed below. + prepicked_compaction->task_token->ReleaseOnce(); + } + if (made_progress || (bg_compaction_scheduled_ == 0 && bg_bottom_compaction_scheduled_ == 0) || @@ -2642,6 +2827,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } else if (is_manual && manual_compaction_paused_.load(std::memory_order_acquire) > 0) { status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); + } else if (is_manual && manual_compaction->canceled && + manual_compaction->canceled->load(std::memory_order_acquire)) { + status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); } } else { status = error_handler_.GetBGError(); @@ -2769,7 +2957,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, c->column_family_data() ->current() ->storage_info() - ->ComputeCompactionScore(*(c->immutable_cf_options()), + ->ComputeCompactionScore(*(c->immutable_options()), *(c->mutable_cf_options())); AddToCompactionQueue(cfd); ++unscheduled_compactions_; @@ -2929,6 +3117,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool"); CompactionArg* ca = new CompactionArg; ca->db = this; + ca->compaction_pri_ = Env::Priority::BOTTOM; ca->prepicked_compaction = new PrepickedCompaction; ca->prepicked_compaction->compaction = c.release(); ca->prepicked_compaction->manual_compaction_state = nullptr; @@ -2952,16 +3141,19 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJob compaction_job( job_context->job_id, c.get(), immutable_db_options_, - file_options_for_compaction_, versions_.get(), &shutting_down_, - preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), - GetDataDir(c->column_family_data(), c->output_path_id()), stats_, - &mutex_, &error_handler_, snapshot_seqs, - earliest_write_conflict_snapshot, snapshot_checker, table_cache_, - &event_logger_, c->mutable_cf_options()->paranoid_file_checks, + mutable_db_options_, file_options_for_compaction_, versions_.get(), + &shutting_down_, preserve_deletes_seqnum_.load(), log_buffer, + directories_.GetDbDir(), + GetDataDir(c->column_family_data(), c->output_path_id()), + GetDataDir(c->column_family_data(), 0), stats_, &mutex_, + &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot, + snapshot_checker, table_cache_, &event_logger_, + c->mutable_cf_options()->paranoid_file_checks, 
c->mutable_cf_options()->report_bg_io_stats, dbname_, &compaction_job_stats, thread_pri, io_tracer_, - is_manual ? &manual_compaction_paused_ : nullptr, db_id_, - db_session_id_); + is_manual ? &manual_compaction_paused_ : nullptr, + is_manual ? manual_compaction->canceled : nullptr, db_id_, + db_session_id_, c->column_family_data()->GetFullHistoryTsLow()); compaction_job.Prepare(); NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, @@ -3036,10 +3228,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, auto err_reason = versions_->io_status().ok() ? BackgroundErrorReason::kCompaction : BackgroundErrorReason::kManifestWrite; - error_handler_.SetBGError(io_s, err_reason).PermitUncheckedError(); + error_handler_.SetBGError(io_s, err_reason); } else { - error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction) - .PermitUncheckedError(); + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); } if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) { // Put this cfd back in the compaction queue so we can retry after some @@ -3051,7 +3242,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, c->column_family_data() ->current() ->storage_info() - ->ComputeCompactionScore(*(c->immutable_cf_options()), + ->ComputeCompactionScore(*(c->immutable_options()), *(c->mutable_cf_options())); if (!cfd->queued_for_compaction()) { AddToCompactionQueue(cfd); @@ -3192,7 +3383,7 @@ bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) { if (m->cfd != m1->cfd) { return false; } - return true; + return false; } #ifndef ROCKSDB_LITE @@ -3216,7 +3407,7 @@ void DBImpl::BuildCompactionJobInfo( for (const auto fmd : *c->inputs(i)) { const FileDescriptor& desc = fmd->fd; const uint64_t file_number = desc.GetNumber(); - auto fn = TableFileName(c->immutable_cf_options()->cf_paths, file_number, + auto fn = TableFileName(c->immutable_options()->cf_paths, file_number, desc.GetPathId()); compaction_job_info->input_files.push_back(fn); compaction_job_info->input_file_infos.push_back(CompactionFileInfo{ @@ -3235,7 +3426,7 @@ void DBImpl::BuildCompactionJobInfo( const FileDescriptor& desc = meta.fd; const uint64_t file_number = desc.GetNumber(); compaction_job_info->output_files.push_back(TableFileName( - c->immutable_cf_options()->cf_paths, file_number, desc.GetPathId())); + c->immutable_options()->cf_paths, file_number, desc.GetPathId())); compaction_job_info->output_file_infos.push_back(CompactionFileInfo{ newf.first, file_number, meta.oldest_blob_file_number}); } diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index c21c9fa8f18..e590607c6d7 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -22,12 +22,13 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() { return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); } -void DBImpl::TEST_SwitchWAL() { +Status DBImpl::TEST_SwitchWAL() { WriteContext write_context; InstrumentedMutexLock l(&mutex_); void* writer = TEST_BeginWrite(); - SwitchWAL(&write_context); + auto s = SwitchWAL(&write_context); TEST_EndWrite(writer); + return s; } bool DBImpl::TEST_WALBufferIsEmpty(bool lock) { @@ -170,12 +171,17 @@ Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) { while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || bg_flush_scheduled_ || (wait_unscheduled && unscheduled_compactions_)) && - (error_handler_.GetBGError() == Status::OK())) { + (error_handler_.GetBGError().ok())) { bg_cv_.Wait(); } return 
error_handler_.GetBGError(); } +Status DBImpl::TEST_GetBGError() { + InstrumentedMutexLock l(&mutex_); + return error_handler_.GetBGError(); +} + void DBImpl::TEST_LockMutex() { mutex_.Lock(); } void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); } diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 2d30f5857b9..c0405d6bf48 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -6,16 +6,17 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "db/db_impl/db_impl.h" - #include #include #include + +#include "db/db_impl/db_impl.h" #include "db/event_helpers.h" #include "db/memtable_list.h" #include "file/file_util.h" #include "file/filename.h" #include "file/sst_file_manager_impl.h" +#include "port/port.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { @@ -119,7 +120,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, mutable_db_options_.delete_obsolete_files_period_micros == 0) { doing_the_full_scan = true; } else { - const uint64_t now_micros = env_->NowMicros(); + const uint64_t now_micros = immutable_db_options_.clock->NowMicros(); if ((delete_obsolete_files_last_run_ + mutable_db_options_.delete_obsolete_files_period_micros) < now_micros) { @@ -190,7 +191,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. std::vector files; - env_->GetChildren(path, &files).PermitUncheckedError(); // Ignore errors + Status s = env_->GetChildren(path, &files); + s.PermitUncheckedError(); // TODO: What should we do on error? for (const std::string& file : files) { uint64_t number; FileType type; @@ -206,7 +208,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, continue; } - // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes + // TODO(icanadi) clean up this mess to avoid having one-off "/" + // prefixes job_context->full_scan_candidate_files.emplace_back("/" + file, path); } } @@ -214,9 +217,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // Add log files in wal_dir if (immutable_db_options_.wal_dir != dbname_) { std::vector log_files; - env_->GetChildren(immutable_db_options_.wal_dir, - &log_files) - .PermitUncheckedError(); // Ignore errors + Status s = env_->GetChildren(immutable_db_options_.wal_dir, &log_files); + s.PermitUncheckedError(); // TODO: What should we do on error? for (const std::string& log_file : log_files) { job_context->full_scan_candidate_files.emplace_back( log_file, immutable_db_options_.wal_dir); @@ -226,9 +228,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, if (!immutable_db_options_.db_log_dir.empty() && immutable_db_options_.db_log_dir != dbname_) { std::vector info_log_files; - // Ignore errors - env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files) - .PermitUncheckedError(); + Status s = + env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files); + s.PermitUncheckedError(); // TODO: What should we do on error? 
for (std::string& log_file : info_log_files) { job_context->full_scan_candidate_files.emplace_back( log_file, immutable_db_options_.db_log_dir); @@ -319,7 +321,7 @@ void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, const_cast(&fname)); Status file_deletion_status; - if (type == kTableFile || type == kBlobFile || type == kLogFile) { + if (type == kTableFile || type == kBlobFile || type == kWalFile) { file_deletion_status = DeleteDBFile(&immutable_db_options_, fname, path_to_sync, /*force_bg=*/false, /*force_fg=*/!wal_in_db_path_); @@ -466,7 +468,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { bool keep = true; switch (type) { - case kLogFile: + case kWalFile: keep = ((number >= state.log_number) || (number == state.prev_log_number) || (log_recycle_files_set.find(number) != @@ -546,7 +548,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { dir_to_sync = candidate_file.file_path; } else { dir_to_sync = - (type == kLogFile) ? immutable_db_options_.wal_dir : dbname_; + (type == kWalFile) ? immutable_db_options_.wal_dir : dbname_; fname = dir_to_sync + ((!dir_to_sync.empty() && dir_to_sync.back() == '/') || (!to_delete.empty() && to_delete.front() == '/') @@ -556,8 +558,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { } #ifndef ROCKSDB_LITE - if (type == kLogFile && (immutable_db_options_.wal_ttl_seconds > 0 || - immutable_db_options_.wal_size_limit_mb > 0)) { + if (type == kWalFile && (immutable_db_options_.WAL_ttl_seconds > 0 || + immutable_db_options_.WAL_size_limit_MB > 0)) { wal_manager_.ArchiveWALFile(fname, number); continue; } @@ -658,13 +660,15 @@ uint64_t FindMinPrepLogReferencedByMemTable( // we must look through the memtables for two phase transactions // that have been committed but not yet flushed + std::unordered_set memtables_to_flush_set( + memtables_to_flush.begin(), memtables_to_flush.end()); for (auto loop_cfd : *vset->GetColumnFamilySet()) { if (loop_cfd->IsDropped() || loop_cfd == cfd_to_flush) { continue; } auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection( - memtables_to_flush); + &memtables_to_flush_set); if (log > 0 && (min_log == 0 || log < min_log)) { min_log = log; @@ -680,16 +684,41 @@ uint64_t FindMinPrepLogReferencedByMemTable( return min_log; } -uint64_t PrecomputeMinLogNumberToKeep( +uint64_t FindMinPrepLogReferencedByMemTable( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector*>& memtables_to_flush) { + uint64_t min_log = 0; + + std::unordered_set cfds_to_flush_set(cfds_to_flush.begin(), + cfds_to_flush.end()); + std::unordered_set memtables_to_flush_set; + for (const autovector* memtables : memtables_to_flush) { + memtables_to_flush_set.insert(memtables->begin(), memtables->end()); + } + for (auto loop_cfd : *vset->GetColumnFamilySet()) { + if (loop_cfd->IsDropped() || cfds_to_flush_set.count(loop_cfd)) { + continue; + } + + auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection( + &memtables_to_flush_set); + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + + log = loop_cfd->mem()->GetMinLogContainingPrepSection(); + if (log > 0 && (min_log == 0 || log < min_log)) { + min_log = log; + } + } + + return min_log; +} + +uint64_t PrecomputeMinLogNumberToKeepNon2PC( VersionSet* vset, const ColumnFamilyData& cfd_to_flush, - autovector edit_list, - const autovector& memtables_to_flush, - LogsWithPrepTracker* prep_tracker) { + const autovector& edit_list) { assert(vset != nullptr); - 
assert(prep_tracker != nullptr); - // Calculate updated min_log_number_to_keep - // Since the function should only be called in 2pc mode, log number in - // the version edit should be sufficient. // Precompute the min log number containing unflushed data for the column // family being flushed (`cfd_to_flush`). @@ -713,6 +742,58 @@ uint64_t PrecomputeMinLogNumberToKeep( min_log_number_to_keep = std::min(cf_min_log_number_to_keep, min_log_number_to_keep); } + return min_log_number_to_keep; +} + +uint64_t PrecomputeMinLogNumberToKeepNon2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists) { + assert(vset != nullptr); + assert(!cfds_to_flush.empty()); + assert(cfds_to_flush.size() == edit_lists.size()); + + uint64_t min_log_number_to_keep = port::kMaxUint64; + for (const auto& edit_list : edit_lists) { + uint64_t log = 0; + for (const auto& e : edit_list) { + if (e->HasLogNumber()) { + log = std::max(log, e->GetLogNumber()); + } + } + if (log != 0) { + min_log_number_to_keep = std::min(min_log_number_to_keep, log); + } + } + if (min_log_number_to_keep == port::kMaxUint64) { + min_log_number_to_keep = cfds_to_flush[0]->GetLogNumber(); + for (size_t i = 1; i < cfds_to_flush.size(); i++) { + min_log_number_to_keep = + std::min(min_log_number_to_keep, cfds_to_flush[i]->GetLogNumber()); + } + } + + std::unordered_set flushed_cfds( + cfds_to_flush.begin(), cfds_to_flush.end()); + min_log_number_to_keep = + std::min(min_log_number_to_keep, + vset->PreComputeMinLogNumberWithUnflushedData(flushed_cfds)); + + return min_log_number_to_keep; +} + +uint64_t PrecomputeMinLogNumberToKeep2PC( + VersionSet* vset, const ColumnFamilyData& cfd_to_flush, + const autovector& edit_list, + const autovector& memtables_to_flush, + LogsWithPrepTracker* prep_tracker) { + assert(vset != nullptr); + assert(prep_tracker != nullptr); + // Calculate updated min_log_number_to_keep + // Since the function should only be called in 2pc mode, log number in + // the version edit should be sufficient. + + uint64_t min_log_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfd_to_flush, edit_list); // if are 2pc we must consider logs containing prepared // sections of outstanding transactions. @@ -741,7 +822,81 @@ uint64_t PrecomputeMinLogNumberToKeep( return min_log_number_to_keep; } -Status DBImpl::FinishBestEffortsRecovery() { +uint64_t PrecomputeMinLogNumberToKeep2PC( + VersionSet* vset, const autovector& cfds_to_flush, + const autovector>& edit_lists, + const autovector*>& memtables_to_flush, + LogsWithPrepTracker* prep_tracker) { + assert(vset != nullptr); + assert(prep_tracker != nullptr); + assert(cfds_to_flush.size() == edit_lists.size()); + assert(cfds_to_flush.size() == memtables_to_flush.size()); + + uint64_t min_log_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfds_to_flush, edit_lists); + + uint64_t min_log_in_prep_heap = + prep_tracker->FindMinLogContainingOutstandingPrep(); + + if (min_log_in_prep_heap != 0 && + min_log_in_prep_heap < min_log_number_to_keep) { + min_log_number_to_keep = min_log_in_prep_heap; + } + + uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable( + vset, cfds_to_flush, memtables_to_flush); + + if (min_log_refed_by_mem != 0 && + min_log_refed_by_mem < min_log_number_to_keep) { + min_log_number_to_keep = min_log_refed_by_mem; + } + + return min_log_number_to_keep; +} + +Status DBImpl::SetDBId(bool read_only) { + Status s; + // Happens when immutable_db_options_.write_dbid_to_manifest is set to true + // the very first time. 
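// The non-2PC helpers above reduce to: for each flushed column family take the
// largest WAL number recorded in its version edits (falling back to the CF's
// current log number when the edits carry none), then keep the minimum of
// those values, further capped by WALs still holding unflushed data in other
// CFs. A worked standalone sketch of the min-of-max step, using plain
// integers and a hypothetical helper rather than RocksDB's types:

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

// edit_logs[i] holds the WAL numbers recorded in the edits of flushed CF i;
// cf_log[i] is that CF's current log number (assumed non-empty input).
uint64_t MinWalToKeepAfterFlush(
    const std::vector<std::vector<uint64_t>>& edit_logs,
    const std::vector<uint64_t>& cf_log) {
  uint64_t min_to_keep = std::numeric_limits<uint64_t>::max();
  for (const auto& logs : edit_logs) {
    if (!logs.empty()) {
      // The newest WAL referenced by this CF's edits; once the flush is
      // installed, older WALs are obsolete as far as this CF is concerned.
      uint64_t newest = *std::max_element(logs.begin(), logs.end());
      min_to_keep = std::min(min_to_keep, newest);
    }
  }
  if (min_to_keep == std::numeric_limits<uint64_t>::max()) {
    // No edit carried a log number: fall back to the CFs' current log numbers.
    min_to_keep = *std::min_element(cf_log.begin(), cf_log.end());
  }
  return min_to_keep;
}

// Example: edits reference WALs {5, 7} and {6}; min-of-max = min(7, 6) = 6,
// so WALs numbered below 6 are no longer needed by the flushed CFs.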
+ if (db_id_.empty()) { + // Check for the IDENTITY file and create it if not there. + s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); + // Typically Identity file is created in NewDB() and for some reason if + // it is no longer available then at this point DB ID is not in Identity + // file or Manifest. + if (s.IsNotFound()) { + // Create a new DB ID, saving to file only if allowed + if (read_only) { + db_id_ = env_->GenerateUniqueId(); + return Status::OK(); + } else { + s = SetIdentityFile(env_, dbname_); + if (!s.ok()) { + return s; + } + } + } else if (!s.ok()) { + assert(s.IsIOError()); + return s; + } + s = GetDbIdentityFromIdentityFile(&db_id_); + if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { + VersionEdit edit; + edit.SetDBId(db_id_); + Options options; + MutableCFOptions mutable_cf_options(options); + versions_->db_id_ = db_id_; + s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &edit, &mutex_, nullptr, + /* new_descriptor_log */ false); + } + } else if (!read_only) { + s = SetIdentityFile(env_, dbname_, db_id_); + } + return s; +} + +Status DBImpl::DeleteUnreferencedSstFiles() { mutex_.AssertHeld(); std::vector paths; paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator))); @@ -788,7 +943,7 @@ Status DBImpl::FinishBestEffortsRecovery() { return s; } - if (largest_file_number > next_file_number) { + if (largest_file_number >= next_file_number) { versions_->next_file_number_.store(largest_file_number + 1); } @@ -797,8 +952,6 @@ Status DBImpl::FinishBestEffortsRecovery() { assert(versions_->GetColumnFamilySet()); ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault(); assert(default_cfd); - // Even if new_descriptor_log is false, we will still switch to a new - // MANIFEST and update CURRENT file, since this is in recovery. 
s = versions_->LogAndApply( default_cfd, *default_cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir(), /*new_descriptor_log*/ false); diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 71f13bacf0f..64f4ae929ea 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -33,15 +33,17 @@ extern "C" bool RocksDbFileChecksumsVerificationEnabledOnRecovery() #endif // !ROCKSDB_LITE && OS_LINUX namespace ROCKSDB_NAMESPACE { -Options SanitizeOptions(const std::string& dbname, const Options& src) { - auto db_options = SanitizeOptions(dbname, DBOptions(src)); +Options SanitizeOptions(const std::string& dbname, const Options& src, + bool read_only) { + auto db_options = SanitizeOptions(dbname, DBOptions(src), read_only); ImmutableDBOptions immutable_db_options(db_options); auto cf_options = SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src)); return Options(db_options, cf_options); } -DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { +DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src, + bool read_only) { DBOptions result(src); if (result.env == nullptr) { @@ -59,7 +61,7 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { &result.max_open_files); } - if (result.info_log == nullptr) { + if (result.info_log == nullptr && !read_only) { Status s = CreateLoggerFromOptions(dbname, result, &result.info_log); if (!s.ok()) { // No place suitable for logging @@ -156,7 +158,8 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { // DeleteScheduler::CleanupDirectory on the same dir later, it will be // safe std::vector filenames; - result.env->GetChildren(result.wal_dir, &filenames).PermitUncheckedError(); + Status s = result.env->GetChildren(result.wal_dir, &filenames); + s.PermitUncheckedError(); //**TODO: What to do on error? 
for (std::string& filename : filenames) { if (filename.find(".log.trash", filename.length() - std::string(".log.trash").length()) != @@ -172,7 +175,8 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { // was not used) auto sfm = static_cast(result.sst_file_manager.get()); for (size_t i = 0; i < result.db_paths.size(); i++) { - DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path); + DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path) + .PermitUncheckedError(); } // Create a default SstFileManager for purposes of tracking compaction size @@ -182,7 +186,7 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { NewSstFileManager(result.env, result.info_log)); result.sst_file_manager = sst_file_manager; } -#endif +#endif // !ROCKSDB_LITE if (!result.paranoid_checks) { result.skip_checking_sst_file_sizes_on_db_open = true; @@ -290,23 +294,28 @@ Status DBImpl::NewDB(std::vector* new_filenames) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); { + if (fs_->FileExists(manifest, IOOptions(), nullptr).ok()) { + fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError(); + } std::unique_ptr file; FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_); s = NewWritableFile(fs_.get(), manifest, &file, file_options); if (!s.ok()) { return s; } + FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; file->SetPreallocationBlockSize( immutable_db_options_.manifest_preallocation_size); std::unique_ptr file_writer(new WritableFileWriter( - std::move(file), manifest, file_options, env_, io_tracer_, - nullptr /* stats */, immutable_db_options_.listeners)); + std::move(file), manifest, file_options, immutable_db_options_.clock, + io_tracer_, nullptr /* stats */, immutable_db_options_.listeners, + nullptr, tmp_set.Contains(FileType::kDescriptorFile))); log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); if (s.ok()) { - s = SyncManifest(env_, &immutable_db_options_, log.file()); + s = SyncManifest(&immutable_db_options_, log.file()); } } if (s.ok()) { @@ -317,7 +326,7 @@ Status DBImpl::NewDB(std::vector* new_filenames) { manifest.substr(manifest.find_last_of("/\\") + 1)); } } else { - fs_->DeleteFile(manifest, IOOptions(), nullptr); + fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError(); } return s; } @@ -410,7 +419,7 @@ Status DBImpl::Recover( } for (const std::string& file : files_in_dbname) { uint64_t number = 0; - FileType type = kLogFile; // initialize + FileType type = kWalFile; // initialize if (ParseFileName(file, &number, &type) && type == kDescriptorFile) { // Found MANIFEST (descriptor log), thus best-efforts recovery does // not have to treat the db as empty. @@ -488,42 +497,14 @@ Status DBImpl::Recover( // TryRecover may delete previous column_family_set_. column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); - s = FinishBestEffortsRecovery(); } } if (!s.ok()) { return s; } - // Happens when immutable_db_options_.write_dbid_to_manifest is set to true - // the very first time. - if (db_id_.empty()) { - // Check for the IDENTITY file and create it if not there. 
- s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr); - // Typically Identity file is created in NewDB() and for some reason if - // it is no longer available then at this point DB ID is not in Identity - // file or Manifest. - if (s.IsNotFound()) { - s = SetIdentityFile(env_, dbname_); - if (!s.ok()) { - return s; - } - } else if (!s.ok()) { - assert(s.IsIOError()); - return s; - } - s = GetDbIdentityFromIdentityFile(&db_id_); - if (immutable_db_options_.write_dbid_to_manifest && s.ok()) { - VersionEdit edit; - edit.SetDBId(db_id_); - Options options; - MutableCFOptions mutable_cf_options(options); - versions_->db_id_ = db_id_; - s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options, &edit, &mutex_, nullptr, - false); - } - } else { - s = SetIdentityFile(env_, dbname_, db_id_); + s = SetDBId(read_only); + if (s.ok() && !read_only) { + s = DeleteUnreferencedSstFiles(); } if (immutable_db_options_.paranoid_checks && s.ok()) { @@ -545,7 +526,7 @@ Status DBImpl::Recover( std::vector files_in_wal_dir; if (s.ok()) { - // Initial max_total_in_memory_state_ before recovery logs. Log recovery + // Initial max_total_in_memory_state_ before recovery wals. Log recovery // may check this value to decide whether to flush. max_total_in_memory_state_ = 0; for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -580,32 +561,53 @@ Status DBImpl::Recover( return s; } - std::vector logs; + std::unordered_map wal_files; for (const auto& file : files_in_wal_dir) { uint64_t number; FileType type; - if (ParseFileName(file, &number, &type) && type == kLogFile) { + if (ParseFileName(file, &number, &type) && type == kWalFile) { if (is_new_db) { return Status::Corruption( "While creating a new Db, wal_dir contains " "existing log file: ", file); } else { - logs.push_back(number); + wal_files[number] = + LogFileName(immutable_db_options_.wal_dir, number); } } } - if (logs.size() > 0) { + if (immutable_db_options_.track_and_verify_wals_in_manifest) { + if (!immutable_db_options_.best_efforts_recovery) { + // Verify WALs in MANIFEST. + s = versions_->GetWalSet().CheckWals(env_, wal_files); + } // else since best effort recovery does not recover from WALs, no need + // to check WALs. + } else if (!versions_->GetWalSet().GetWals().empty()) { + // Tracking is disabled, clear previously tracked WALs from MANIFEST, + // otherwise, in the future, if WAL tracking is enabled again, + // since the WALs deleted when WAL tracking is disabled are not persisted + // into MANIFEST, WAL check may fail. 
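The WAL verification added above only runs when tracking is enabled. A minimal usage sketch of turning the option on at open time (path and setup are illustrative, not taken from this patch):

    #include <cassert>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      // When enabled, WAL additions/deletions are recorded in the MANIFEST and
      // cross-checked against the files in wal_dir during recovery (skipped
      // under best-efforts recovery, as noted in the change above).
      options.track_and_verify_wals_in_manifest = true;

      rocksdb::DB* db = nullptr;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/wal_tracking_demo", &db);
      assert(s.ok());
      delete db;
      return 0;
    }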
+ VersionEdit edit; + WalNumber max_wal_number = + versions_->GetWalSet().GetWals().rbegin()->first; + edit.DeleteWalsBefore(max_wal_number + 1); + s = versions_->LogAndApplyToDefaultColumnFamily(&edit, &mutex_); + } + if (!s.ok()) { + return s; + } + + if (!wal_files.empty()) { if (error_if_wal_file_exists) { return Status::Corruption( "The db was opened in readonly mode with error_if_wal_file_exists" "flag but a WAL file already exists"); } else if (error_if_data_exists_in_wals) { - for (auto& log : logs) { - std::string fname = LogFileName(immutable_db_options_.wal_dir, log); + for (auto& wal_file : wal_files) { uint64_t bytes; - s = env_->GetFileSize(fname, &bytes); + s = env_->GetFileSize(wal_file.second, &bytes); if (s.ok()) { if (bytes > 0) { return Status::Corruption( @@ -617,13 +619,19 @@ Status DBImpl::Recover( } } - if (!logs.empty()) { - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - bool corrupted_log_found = false; - s = RecoverLogFiles(logs, &next_sequence, read_only, - &corrupted_log_found); - if (corrupted_log_found && recovered_seq != nullptr) { + if (!wal_files.empty()) { + // Recover in the order in which the wals were generated + std::vector wals; + wals.reserve(wal_files.size()); + for (const auto& wal_file : wal_files) { + wals.push_back(wal_file.first); + } + std::sort(wals.begin(), wals.end()); + + bool corrupted_wal_found = false; + s = RecoverLogFiles(wals, &next_sequence, read_only, + &corrupted_wal_found); + if (corrupted_wal_found && recovered_seq != nullptr) { *recovered_seq = next_sequence; } if (!s.ok()) { @@ -776,10 +784,10 @@ Status DBImpl::InitPersistStatsColumnFamily() { return s; } -// REQUIRES: log_numbers are sorted in ascending order -Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, +// REQUIRES: wal_numbers are sorted in ascending order +Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, SequenceNumber* next_sequence, bool read_only, - bool* corrupted_log_found) { + bool* corrupted_wal_found) { struct LogReporter : public log::Reader::Reporter { Env* env; Logger* info_log; @@ -809,10 +817,10 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, auto stream = event_logger_.Log(); stream << "job" << job_id << "event" << "recovery_started"; - stream << "log_files"; + stream << "wal_files"; stream.StartArray(); - for (auto log_number : log_numbers) { - stream << log_number; + for (auto wal_number : wal_numbers) { + stream << wal_number; } stream.EndArray(); } @@ -835,25 +843,25 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, bool stop_replay_by_wal_filter = false; bool stop_replay_for_corruption = false; bool flushed = false; - uint64_t corrupted_log_number = kMaxSequenceNumber; - uint64_t min_log_number = MinLogNumberToKeep(); - for (auto log_number : log_numbers) { - if (log_number < min_log_number) { + uint64_t corrupted_wal_number = kMaxSequenceNumber; + uint64_t min_wal_number = MinLogNumberToKeep(); + for (auto wal_number : wal_numbers) { + if (wal_number < min_wal_number) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Skipping log #%" PRIu64 " since it is older than min log to keep #%" PRIu64, - log_number, min_log_number); + wal_number, min_wal_number); continue; } // The previous incarnation may not have written any MANIFEST // records after allocating this log number. So we manually // update the file number allocation counter in VersionSet. 
- versions_->MarkFileNumberUsed(log_number); + versions_->MarkFileNumberUsed(wal_number); // Open the log file - std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + std::string fname = LogFileName(immutable_db_options_.wal_dir, wal_number); ROCKS_LOG_INFO(immutable_db_options_.info_log, - "Recovering log #%" PRIu64 " mode %d", log_number, + "Recovering log #%" PRIu64 " mode %d", wal_number, static_cast(immutable_db_options_.wal_recovery_mode)); auto logFileDropped = [this, &fname]() { uint64_t bytes; @@ -906,7 +914,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // to be skipped instead of propagating bad information (like overly // large sequence numbers). log::Reader reader(immutable_db_options_.info_log, std::move(file_reader), - &reporter, true /*checksum*/, log_number); + &reporter, true /*checksum*/, wal_number); // Determine if we should tolerate incomplete records at the tail end of the // Read all the records and add to a memtable @@ -954,7 +962,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, WalFilter::WalProcessingOption wal_processing_option = immutable_db_options_.wal_filter->LogRecordFound( - log_number, fname, batch, &new_batch, &batch_changed); + wal_number, fname, batch, &new_batch, &batch_changed); switch (wal_processing_option) { case WalFilter::WalProcessingOption::kContinueProcessing: @@ -1006,7 +1014,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, " mode %d log filter %s returned " "more records (%d) than original (%d) which is not allowed. " "Aborting recovery.", - log_number, + wal_number, static_cast(immutable_db_options_.wal_recovery_mode), immutable_db_options_.wal_filter->Name(), new_count, original_count); @@ -1033,7 +1041,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, bool has_valid_writes = false; status = WriteBatchInternal::InsertInto( &batch, column_family_memtables_.get(), &flush_scheduler_, - &trim_history_scheduler_, true, log_number, this, + &trim_history_scheduler_, true, wal_number, this, false /* concurrent_memtable_writes */, next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); MaybeIgnoreError(&status); @@ -1053,7 +1061,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, cfd->UnrefAndTryDelete(); // If this asserts, it means that InsertInto failed in // filtering updates to already-flushed column families - assert(cfd->GetLogNumber() <= log_number); + assert(cfd->GetLogNumber() <= wal_number); auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; @@ -1090,21 +1098,21 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, " seq #%" PRIu64 ". %s. 
This likely mean loss of synced WAL, " "thus recovery fails.", - log_number, *next_sequence, + wal_number, *next_sequence, status.ToString().c_str()); return status; } // We should ignore the error but not continue replaying status = Status::OK(); stop_replay_for_corruption = true; - corrupted_log_number = log_number; - if (corrupted_log_found != nullptr) { - *corrupted_log_found = true; + corrupted_wal_number = wal_number; + if (corrupted_wal_found != nullptr) { + *corrupted_wal_found = true; } ROCKS_LOG_INFO(immutable_db_options_.info_log, "Point in time recovered to log #%" PRIu64 " seq #%" PRIu64, - log_number, *next_sequence); + wal_number, *next_sequence); } else { assert(immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kTolerateCorruptedTailRecords || @@ -1130,7 +1138,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // corruption. This could during PIT recovery when the WAL is corrupted and // some (but not all) CFs are flushed // Exclude the PIT case where no log is dropped after the corruption point. - // This is to cover the case for empty logs after corrupted log, in which we + // This is to cover the case for empty wals after corrupted log, in which we // don't reset stop_replay_for_corruption. if (stop_replay_for_corruption == true && (immutable_db_options_.wal_recovery_mode == @@ -1138,11 +1146,29 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kTolerateCorruptedTailRecords)) { for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->GetLogNumber() > corrupted_log_number) { + // One special case causes cfd->GetLogNumber() > corrupted_wal_number while + // the CF is still consistent: if a new column family is created during + // the flush and the WAL sync fails at the same time, the new CF points to + // the new WAL but the old WAL is corrupted. Since the new CF is empty, it + // is still consistent. We add the check of CF sst file size to avoid the + // false positive alert. + + // Note that the check of (cfd->GetLiveSstFilesSize() > 0) may lead to + // ignoring a very rare inconsistency case caused by data + // cancellation: one CF is empty due to KV deletion, but those operations + // are in the WAL. If the WAL is corrupted, the status of this CF might + // not be consistent with others. However, the consistency check will be + // bypassed due to the empty CF. + // TODO: a better and complete implementation is needed to ensure a strict + // consistency check in WAL recovery, including handling the tailing + // issues.
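The point-in-time branch discussed above is selected through DBOptions::wal_recovery_mode. A short sketch of opting into it (illustrative only):

    #include <cassert>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      // Stop replay at the first corrupted WAL record and open with whatever
      // was recovered up to that point, instead of failing the open.
      options.wal_recovery_mode =
          rocksdb::WALRecoveryMode::kPointInTimeRecovery;

      rocksdb::DB* db = nullptr;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/pit_recovery_demo", &db);
      assert(s.ok());
      delete db;
      return 0;
    }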
+ if (cfd->GetLogNumber() > corrupted_wal_number && + cfd->GetLiveSstFilesSize() > 0) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Column family inconsistency: SST file contains data" " beyond the point of corruption."); - return Status::Corruption("SST file is ahead of WALs"); + return Status::Corruption("SST file is ahead of WALs in CF " + + cfd->GetName()); } } } @@ -1153,16 +1179,16 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, if (!read_only) { // no need to refcount since client still doesn't have access // to the DB and can not drop column families while we iterate - auto max_log_number = log_numbers.back(); + const WalNumber max_wal_number = wal_numbers.back(); for (auto cfd : *versions_->GetColumnFamilySet()) { auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; - if (cfd->GetLogNumber() > max_log_number) { + if (cfd->GetLogNumber() > max_wal_number) { // Column family cfd has already flushed the data - // from all logs. Memtable has to be empty because - // we filter the updates based on log_number + // from all wals. Memtable has to be empty because + // we filter the updates based on wal_number // (in WriteBatch::InsertInto) assert(cfd->mem()->GetFirstSequenceNumber() == 0); assert(edit->NumEntries() == 0); @@ -1194,13 +1220,13 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // Update the log number info in the version edit corresponding to this // column family. Note that the version edits will be written to MANIFEST // together later. - // writing log_number in the manifest means that any log file - // with number strongly less than (log_number + 1) is already + // writing wal_number in the manifest means that any log file + // with number strongly less than (wal_number + 1) is already // recovered and should be ignored on next reincarnation. - // Since we already recovered max_log_number, we want all logs - // with numbers `<= max_log_number` (includes this one) to be ignored + // Since we already recovered max_wal_number, we want all wals + // with numbers `<= max_wal_number` (includes this one) to be ignored if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) { - edit->SetLogNumber(max_log_number + 1); + edit->SetLogNumber(max_wal_number + 1); } } if (status.ok()) { @@ -1208,7 +1234,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // not actually used. 
that is because VersionSet assumes // VersionSet::next_file_number_ always to be strictly greater than any // log number - versions_->MarkFileNumberUsed(max_log_number + 1); + versions_->MarkFileNumberUsed(max_wal_number + 1); autovector cfds; autovector cf_opts; @@ -1220,6 +1246,14 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, assert(iter != version_edits.end()); edit_lists.push_back({&iter->second}); } + + std::unique_ptr wal_deletion; + if (immutable_db_options_.track_and_verify_wals_in_manifest) { + wal_deletion.reset(new VersionEdit); + wal_deletion->DeleteWalsBefore(max_wal_number + 1); + edit_lists.back().push_back(wal_deletion.get()); + } + // write MANIFEST with update status = versions_->LogAndApply(cfds, cf_opts, edit_lists, &mutex_, directories_.GetDbDir(), @@ -1227,8 +1261,17 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, } } - if (status.ok() && data_seen && !flushed) { - status = RestoreAliveLogFiles(log_numbers); + if (status.ok()) { + if (data_seen && !flushed) { + status = RestoreAliveLogFiles(wal_numbers); + } else { + // If there's no data in the WAL, or we flushed all the data, still + // truncate the log file. If the process goes into a crash loop before + // the file is deleted, the preallocated space will never get freed. + const bool truncate = !read_only; + GetLogSizeAndMaybeTruncate(wal_numbers.back(), truncate, nullptr) + .PermitUncheckedError(); + } } event_logger_.Log() << "job" << job_id << "event" @@ -1237,8 +1280,42 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, return status; } -Status DBImpl::RestoreAliveLogFiles(const std::vector& log_numbers) { - if (log_numbers.empty()) { +Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate, + LogFileNumberSize* log_ptr) { + LogFileNumberSize log(wal_number); + std::string fname = LogFileName(immutable_db_options_.wal_dir, wal_number); + Status s; + // This gets the appear size of the wals, not including preallocated space. + s = env_->GetFileSize(fname, &log.size); + if (s.ok() && truncate) { + std::unique_ptr last_log; + Status truncate_status = fs_->ReopenWritableFile( + fname, + fs_->OptimizeForLogWrite( + file_options_, + BuildDBOptions(immutable_db_options_, mutable_db_options_)), + &last_log, nullptr); + if (truncate_status.ok()) { + truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); + } + if (truncate_status.ok()) { + truncate_status = last_log->Close(IOOptions(), nullptr); + } + // Not a critical error if fail to truncate. + if (!truncate_status.ok() && !truncate_status.IsNotSupported()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to truncate log #%" PRIu64 ": %s", wal_number, + truncate_status.ToString().c_str()); + } + } + if (log_ptr) { + *log_ptr = log; + } + return s; +} + +Status DBImpl::RestoreAliveLogFiles(const std::vector& wal_numbers) { + if (wal_numbers.empty()) { return Status::OK(); } Status s; @@ -1251,40 +1328,18 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector& log_numbers) { // FindObsoleteFiles() total_log_size_ = 0; log_empty_ = false; - for (auto log_number : log_numbers) { - LogFileNumberSize log(log_number); - std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); - // This gets the appear size of the logs, not including preallocated space. 
- s = env_->GetFileSize(fname, &log.size); + for (auto wal_number : wal_numbers) { + // We preallocate space for wals, but then after a crash and restart, those + // preallocated space are not needed anymore. It is likely only the last + // log has such preallocated space, so we only truncate for the last log. + LogFileNumberSize log; + s = GetLogSizeAndMaybeTruncate( + wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log); if (!s.ok()) { break; } total_log_size_ += log.size; alive_log_files_.push_back(log); - // We preallocate space for logs, but then after a crash and restart, those - // preallocated space are not needed anymore. It is likely only the last - // log has such preallocated space, so we only truncate for the last log. - if (log_number == log_numbers.back()) { - std::unique_ptr last_log; - Status truncate_status = fs_->ReopenWritableFile( - fname, - fs_->OptimizeForLogWrite( - file_options_, - BuildDBOptions(immutable_db_options_, mutable_db_options_)), - &last_log, nullptr); - if (truncate_status.ok()) { - truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr); - } - if (truncate_status.ok()) { - truncate_status = last_log->Close(IOOptions(), nullptr); - } - // Not a critical error if fail to truncate. - if (!truncate_status.ok()) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "Failed to truncate log #%" PRIu64 ": %s", log_number, - truncate_status.ToString().c_str()); - } - } } if (two_write_queues_) { log_write_mutex_.Unlock(); @@ -1295,7 +1350,7 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector& log_numbers) { Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit) { mutex_.AssertHeld(); - const uint64_t start_micros = env_->NowMicros(); + const uint64_t start_micros = immutable_db_options_.clock->NowMicros(); FileMetaData meta; std::vector blob_file_additions; @@ -1323,7 +1378,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, cfd->GetLatestMutableCFOptions()->paranoid_file_checks; int64_t _current_time = 0; - env_->GetCurrentTime(&_current_time) + immutable_db_options_.clock->GetCurrentTime(&_current_time) .PermitUncheckedError(); // ignore error const uint64_t current_time = static_cast(_current_time); meta.oldest_ancester_time = current_time; @@ -1348,21 +1403,23 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, } IOStatus io_s; - s = BuildTable( - dbname_, versions_.get(), env_, fs_.get(), *cfd->ioptions(), - mutable_cf_options, file_options_for_compaction_, cfd->table_cache(), - iter.get(), std::move(range_del_iters), &meta, &blob_file_additions, - cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), - cfd->GetID(), cfd->GetName(), snapshot_seqs, - earliest_write_conflict_snapshot, snapshot_checker, + TableBuilderOptions tboptions( + *cfd->ioptions(), mutable_cf_options, cfd->internal_comparator(), + cfd->int_tbl_prop_collector_factories(), GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), - mutable_cf_options.sample_for_compression, - mutable_cf_options.compression_opts, paranoid_file_checks, - cfd->internal_stats(), TableFileCreationReason::kRecovery, &io_s, - io_tracer_, &event_logger_, job_id, Env::IO_HIGH, - nullptr /* table_properties */, -1 /* level */, current_time, - 0 /* oldest_key_time */, write_hint, 0 /* file_creation_time */, - db_id_, db_session_id_); + mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(), + 0 /* level */, false /* is_bottommost */, + 
TableFileCreationReason::kRecovery, current_time, + 0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_, + db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber()); + s = BuildTable( + dbname_, versions_.get(), immutable_db_options_, tboptions, + file_options_for_compaction_, cfd->table_cache(), iter.get(), + std::move(range_del_iters), &meta, &blob_file_additions, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, + paranoid_file_checks, cfd->internal_stats(), &io_s, io_tracer_, + &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */, + write_hint, nullptr /*full_history_ts_low*/, &blob_callback_); LogFlush(immutable_db_options_.info_log); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" @@ -1370,6 +1427,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), s.ToString().c_str()); mutex_.Lock(); + + io_s.PermitUncheckedError(); // TODO(AR) is this correct, or should we + // return io_s if not ok()? } } ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); @@ -1377,7 +1437,6 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. const bool has_output = meta.fd.GetFileSize() > 0; - assert(has_output || blob_file_additions.empty()); constexpr int level = 0; @@ -1389,26 +1448,30 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, meta.oldest_ancester_time, meta.file_creation_time, meta.file_checksum, meta.file_checksum_func_name); - edit->SetBlobFileAdditions(std::move(blob_file_additions)); + for (const auto& blob : blob_file_additions) { + edit->AddBlobFile(blob); + } } InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); - stats.micros = env_->NowMicros() - start_micros; + stats.micros = immutable_db_options_.clock->NowMicros() - start_micros; if (has_output) { stats.bytes_written = meta.fd.GetFileSize(); + stats.num_output_files = 1; + } - const auto& blobs = edit->GetBlobFileAdditions(); - for (const auto& blob : blobs) { - stats.bytes_written += blob.GetTotalBlobBytes(); - } - - stats.num_output_files = static_cast(blobs.size()) + 1; + const auto& blobs = edit->GetBlobFileAdditions(); + for (const auto& blob : blobs) { + stats.bytes_written_blob += blob.GetTotalBlobBytes(); } + stats.num_output_files_blob = static_cast(blobs.size()); + cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats); - cfd->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, - stats.bytes_written); + cfd->internal_stats()->AddCFStats( + InternalStats::BYTES_FLUSHED, + stats.bytes_written + stats.bytes_written_blob); RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); return s; } @@ -1498,9 +1561,11 @@ IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, lfile->SetPreallocationBlockSize(preallocate_block_size); const auto& listeners = immutable_db_options_.listeners; + FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types; std::unique_ptr file_writer(new WritableFileWriter( - std::move(lfile), log_fname, opt_file_options, env_, io_tracer_, - nullptr /* stats */, listeners)); + std::move(lfile), log_fname, opt_file_options, + immutable_db_options_.clock, io_tracer_, nullptr /* stats */, listeners, + nullptr, tmp_set.Contains(FileType::kWalFile))); *new_log = new 
log::Writer(std::move(file_writer), log_file_num, immutable_db_options_.recycle_log_file_num > 0, immutable_db_options_.manual_wal_flush); @@ -1581,6 +1646,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, InstrumentedMutexLock wl(&impl->log_write_mutex_); impl->logfile_number_ = new_log_number; assert(new_log != nullptr); + assert(impl->logs_.empty()); impl->logs_.emplace_back(new_log_number, new_log); } @@ -1635,7 +1701,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, // In WritePrepared there could be gap in sequence numbers. This breaks // the trick we use in kPointInTimeRecovery which assumes the first seq in // the log right after the corrupted log is one larger than the last seq - // we read from the logs. To let this trick keep working, we add a dummy + // we read from the wals. To let this trick keep working, we add a dummy // entry with the expected sequence to the first log right after recovery. // In non-WritePrepared case also the new log after recovery could be // empty, and thus missing the consecutive seq hint to distinguish @@ -1728,6 +1794,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, std::vector metadata; + // TODO: Once GetLiveFilesMetaData supports blob files, update the logic + // below to get known_file_sizes for blob files. impl->mutex_.Lock(); impl->versions_->GetLiveFilesMetaData(&metadata); impl->mutex_.Unlock(); @@ -1753,21 +1821,19 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); for (auto& path : paths) { std::vector existing_files; - // TODO: Check for errors here? impl->immutable_db_options_.env->GetChildren(path, &existing_files) - .PermitUncheckedError(); + .PermitUncheckedError(); //**TODO: What do to on error? for (auto& file_name : existing_files) { uint64_t file_number; FileType file_type; std::string file_path = path + "/" + file_name; if (ParseFileName(file_name, &file_number, &file_type) && - file_type == kTableFile) { + (file_type == kTableFile || file_type == kBlobFile)) { // TODO: Check for errors from OnAddFile? if (known_file_sizes.count(file_name)) { // We're assuming that each sst file name exists in at most one of // the paths. - sfm->OnAddFile(file_path, known_file_sizes.at(file_name), - /* compaction */ false) + sfm->OnAddFile(file_path, known_file_sizes.at(file_name)) .PermitUncheckedError(); } else { sfm->OnAddFile(file_path).PermitUncheckedError(); diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index 57825afbb38..825e83d2fb8 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -6,7 +6,7 @@ #include "db/db_impl/db_impl_readonly.h" #include "db/arena_wrapped_db_iter.h" -#include "db/compacted_db_impl.h" +#include "db/db_impl/compacted_db_impl.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/merge_context.h" @@ -19,7 +19,8 @@ namespace ROCKSDB_NAMESPACE { DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, const std::string& dbname) - : DBImpl(db_options, dbname) { + : DBImpl(db_options, dbname, /*seq_per_batch*/ false, + /*batch_per_txn*/ true, /*read_only*/ true) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Opening the db in read only mode"); LogFlush(immutable_db_options_.info_log); @@ -83,7 +84,7 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, ReadCallback* read_callback = nullptr; // No read callback provided. 
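For context, the read-only path touched above is reached through DB::OpenForReadOnly. A minimal usage sketch (paths and keys are placeholders):

    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    int main() {
      rocksdb::Options options;
      rocksdb::DB* db = nullptr;
      // Opens an existing DB without write access; per the SanitizeOptions
      // change above, no info log is created when opening read-only.
      rocksdb::Status s =
          rocksdb::DB::OpenForReadOnly(options, "/tmp/existing_db", &db,
                                       /*error_if_wal_file_exists=*/false);
      if (s.ok()) {
        std::string value;
        s = db->Get(rocksdb::ReadOptions(), "some_key", &value);
        delete db;
      }
      return s.ok() ? 0 : 1;
    }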
auto db_iter = NewArenaWrappedDbIterator( env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, - read_seq, + super_version->current, read_seq, super_version->mutable_cf_options.max_sequential_skip_in_iterations, super_version->version_number, read_callback); auto internal_iter = NewInternalIterator( @@ -115,7 +116,8 @@ Status DBImplReadOnly::NewIterators( auto* cfd = static_cast_with_check(cfh)->cfd(); auto* sv = cfd->GetSuperVersion()->Ref(); auto* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, read_seq, + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, + sv->current, read_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, sv->version_number, read_callback); auto* internal_iter = NewInternalIterator( @@ -130,8 +132,8 @@ Status DBImplReadOnly::NewIterators( } namespace { -// Return OK if dbname exists in the file system -// or create_if_missing is false +// Return OK if dbname exists in the file system or create it if +// create_if_missing Status OpenForReadOnlyCheckExistence(const DBOptions& db_options, const std::string& dbname) { Status s; @@ -142,9 +144,9 @@ Status OpenForReadOnlyCheckExistence(const DBOptions& db_options, uint64_t manifest_file_number; s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path, &manifest_file_number); - if (!s.ok()) { - return Status::NotFound(CurrentFileName(dbname), "does not exist"); - } + } else { + // Historic behavior that doesn't necessarily make sense + s = db_options.env->CreateDirIfMissing(dbname); } return s; } @@ -152,7 +154,6 @@ Status OpenForReadOnlyCheckExistence(const DBOptions& db_options, Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, DB** dbptr, bool /*error_if_wal_file_exists*/) { - // If dbname does not exist in the file system, should not do anything Status s = OpenForReadOnlyCheckExistence(options, dbname); if (!s.ok()) { return s; diff --git a/db/db_impl/db_impl_remote_compaction.cc b/db/db_impl/db_impl_remote_compaction.cc index 3fc4d5506ff..f8e9f381932 100644 --- a/db/db_impl/db_impl_remote_compaction.cc +++ b/db/db_impl/db_impl_remote_compaction.cc @@ -127,15 +127,18 @@ Status DBImpl::doCompact(const CompactionOptions& compact_options, // create compaction job CompactionJob compaction_job( - job_context->job_id, c.get(), immutable_db_options_, + job_context->job_id, c.get(), immutable_db_options_, mutable_db_options_, file_options_for_compaction_, versions_.get(), &shutting_down_, preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), - GetDataDir(c->column_family_data(), c->output_path_id()), stats_, &mutex_, - &error_handler_, existing_snapshots, earliest_write_conflict_snapshot, - snapshot_checker, table_cache_, &event_logger_, + GetDataDir(c->column_family_data(), c->output_path_id()), + GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_, + existing_snapshots, earliest_write_conflict_snapshot, snapshot_checker, + table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats, Env::Priority::USER, nullptr); + &compaction_job_stats, Env::Priority::USER, io_tracer_, + &manual_compaction_paused_, nullptr, db_id_, db_session_id_, + c->column_family_data()->GetFullHistoryTsLow()); compaction_job.Prepare(); mutex_.Unlock(); diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index c0572948e23..10c04b3e169 100644 --- 
a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -11,14 +11,17 @@ #include "db/merge_context.h" #include "logging/auto_roll_logger.h" #include "monitoring/perf_context_imp.h" +#include "rocksdb/configurable.h" #include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE DBImplSecondary::DBImplSecondary(const DBOptions& db_options, - const std::string& dbname) - : DBImpl(db_options, dbname) { + const std::string& dbname, + std::string secondary_path) + : DBImpl(db_options, dbname, false, true, true), + secondary_path_(std::move(secondary_path)) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Opening the db in secondary mode"); LogFlush(immutable_db_options_.info_log); @@ -112,7 +115,7 @@ Status DBImplSecondary::FindNewLogNumbers(std::vector* logs) { for (size_t i = 0; i < filenames.size(); i++) { uint64_t number; FileType type; - if (ParseFileName(filenames[i], &number, &type) && type == kLogFile && + if (ParseFileName(filenames[i], &number, &type) && type == kWalFile && number >= log_number_min) { logs->push_back(number); } @@ -327,8 +330,8 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { assert(pinnable_val != nullptr); - PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); - StopWatch sw(env_, stats_, DB_GET); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); auto cfh = static_cast(column_family); @@ -421,7 +424,7 @@ ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); auto db_iter = NewArenaWrappedDbIterator( env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, - snapshot, + super_version->current, snapshot, super_version->mutable_cf_options.max_sequential_skip_in_iterations, super_version->version_number, read_callback); auto internal_iter = NewInternalIterator( @@ -519,7 +522,8 @@ Status DBImplSecondary::TryCatchUpWithPrimary() { { InstrumentedMutexLock lock_guard(&mutex_); s = static_cast_with_check(versions_.get()) - ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); + ->ReadAndApply(&mutex_, &manifest_reader_, + manifest_reader_status_.get(), &cfds_changed); ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64, static_cast(versions_->LastSequence())); @@ -616,7 +620,7 @@ Status DB::OpenAsSecondary( } handles->clear(); - DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); + DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path); impl->versions_.reset(new ReactiveVersionSet( dbname, &impl->immutable_db_options_, impl->file_options_, impl->table_cache_.get(), impl->write_buffer_manager_, @@ -662,6 +666,159 @@ Status DB::OpenAsSecondary( } return s; } + +Status DBImplSecondary::CompactWithoutInstallation( + ColumnFamilyHandle* cfh, const CompactionServiceInput& input, + CompactionServiceResult* result) { + InstrumentedMutexLock l(&mutex_); + auto cfd = static_cast_with_check(cfh)->cfd(); + if (!cfd) { + return Status::InvalidArgument("Cannot find column family" + + cfh->GetName()); + } + + std::unordered_set input_set; + for (const auto& file_name : input.input_files) { + input_set.insert(TableFileNameToNumber(file_name)); + } + + auto* version = cfd->current(); + + ColumnFamilyMetaData cf_meta; + version->GetColumnFamilyMetaData(&cf_meta); + + const MutableCFOptions* 
mutable_cf_options = cfd->GetLatestMutableCFOptions(); + ColumnFamilyOptions cf_options = cfd->GetLatestCFOptions(); + VersionStorageInfo* vstorage = version->storage_info(); + + // Use comp_options to reuse some CompactFiles functions + CompactionOptions comp_options; + comp_options.compression = kDisableCompressionOption; + comp_options.output_file_size_limit = MaxFileSizeForLevel( + *mutable_cf_options, input.output_level, cf_options.compaction_style, + vstorage->base_level(), cf_options.level_compaction_dynamic_level_bytes); + + std::vector input_files; + Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( + &input_files, &input_set, vstorage, comp_options); + if (!s.ok()) { + return s; + } + + std::unique_ptr c; + assert(cfd->compaction_picker()); + c.reset(cfd->compaction_picker()->CompactFiles( + comp_options, input_files, input.output_level, vstorage, + *mutable_cf_options, mutable_db_options_, 0)); + assert(c != nullptr); + + c->SetInputVersion(version); + + // Create output directory if it's not existed yet + std::unique_ptr output_dir; + s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir); + if (!s.ok()) { + return s; + } + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); + + const int job_id = next_job_id_.fetch_add(1); + + CompactionServiceCompactionJob compaction_job( + job_id, c.get(), immutable_db_options_, mutable_db_options_, + file_options_for_compaction_, versions_.get(), &shutting_down_, + &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_, + input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_, + db_id_, db_session_id_, secondary_path_, input, result); + + mutex_.Unlock(); + s = compaction_job.Run(); + mutex_.Lock(); + + // clean up + compaction_job.io_status().PermitUncheckedError(); + compaction_job.CleanupCompaction(); + c->ReleaseCompactionFiles(s); + c.reset(); + + TEST_SYNC_POINT_CALLBACK("DBImplSecondary::CompactWithoutInstallation::End", + &s); + result->status = s; + return s; +} + +Status DB::OpenAndCompact( + const std::string& name, const std::string& output_directory, + const std::string& input, std::string* result, + const CompactionServiceOptionsOverride& override_options) { + CompactionServiceInput compaction_input; + Status s = CompactionServiceInput::Read(input, &compaction_input); + if (!s.ok()) { + return s; + } + + compaction_input.db_options.max_open_files = -1; + compaction_input.db_options.compaction_service = nullptr; + if (compaction_input.db_options.statistics) { + compaction_input.db_options.statistics.reset(); + } + compaction_input.db_options.env = override_options.env; + compaction_input.db_options.file_checksum_gen_factory = + override_options.file_checksum_gen_factory; + compaction_input.column_family.options.comparator = + override_options.comparator; + compaction_input.column_family.options.merge_operator = + override_options.merge_operator; + compaction_input.column_family.options.compaction_filter = + override_options.compaction_filter; + compaction_input.column_family.options.compaction_filter_factory = + override_options.compaction_filter_factory; + compaction_input.column_family.options.prefix_extractor = + override_options.prefix_extractor; + compaction_input.column_family.options.table_factory = + override_options.table_factory; + compaction_input.column_family.options.sst_partitioner_factory = + override_options.sst_partitioner_factory; + + std::vector column_families; + 
column_families.push_back(compaction_input.column_family); + // TODO: we have to open default CF, because of an implementation limitation, + // currently we just use the same CF option from input, which is not collect + // and open may fail. + if (compaction_input.column_family.name != kDefaultColumnFamilyName) { + column_families.emplace_back(kDefaultColumnFamilyName, + compaction_input.column_family.options); + } + + DB* db; + std::vector handles; + + s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory, + column_families, &handles, &db); + if (!s.ok()) { + return s; + } + + CompactionServiceResult compaction_result; + DBImplSecondary* db_secondary = static_cast_with_check(db); + assert(handles.size() > 0); + s = db_secondary->CompactWithoutInstallation(handles[0], compaction_input, + &compaction_result); + + Status serialization_status = compaction_result.Write(result); + + for (auto& handle : handles) { + delete handle; + } + delete db; + if (s.ok()) { + return serialization_status; + } + return s; +} + #else // !ROCKSDB_LITE Status DB::OpenAsSecondary(const Options& /*options*/, diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h index 8fc58616fe2..e278b79cde2 100644 --- a/db/db_impl/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -71,7 +71,8 @@ class LogReaderContainer { // effort attempts to catch up with the primary. class DBImplSecondary : public DBImpl { public: - DBImplSecondary(const DBOptions& options, const std::string& dbname); + DBImplSecondary(const DBOptions& options, const std::string& dbname, + std::string secondary_path); ~DBImplSecondary() override; // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_ @@ -222,6 +223,14 @@ class DBImplSecondary : public DBImpl { // not flag the missing file as inconsistency. Status CheckConsistency() override; +#ifndef NDEBUG + Status TEST_CompactWithoutInstallation(ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, + CompactionServiceResult* result) { + return CompactWithoutInstallation(cfh, input, result); + } +#endif // NDEBUG + protected: // ColumnFamilyCollector is a write batch handler which does nothing // except recording unique column family IDs @@ -316,6 +325,13 @@ class DBImplSecondary : public DBImpl { std::unordered_set* cfds_changed, JobContext* job_context); + // Run compaction without installation, the output files will be placed in the + // secondary DB path. The LSM tree won't be changed, the secondary DB is still + // in read-only mode. + Status CompactWithoutInstallation(ColumnFamilyHandle* cfh, + const CompactionServiceInput& input, + CompactionServiceResult* result); + std::unique_ptr manifest_reader_; std::unique_ptr manifest_reporter_; std::unique_ptr manifest_reader_status_; @@ -326,6 +342,8 @@ class DBImplSecondary : public DBImpl { // Current WAL number replayed for each column family. 
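The secondary-instance plumbing above (secondary_path_, CompactWithoutInstallation) sits behind DB::OpenAsSecondary. A minimal sketch of opening a secondary and catching up with the primary (paths are placeholders):

    #include <cassert>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    int main() {
      rocksdb::Options options;
      options.max_open_files = -1;  // secondaries are typically opened this way

      rocksdb::DB* db = nullptr;
      // secondary_path holds the secondary instance's own files and, with this
      // change, the output of CompactWithoutInstallation.
      rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
          options, "/path/to/primary_db", "/path/to/secondary_path", &db);
      assert(s.ok());

      // Replay MANIFEST entries and WALs written by the primary since open.
      s = db->TryCatchUpWithPrimary();
      assert(s.ok());
      delete db;
      return 0;
    }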
std::unordered_map cfd_to_current_log_; + + const std::string secondary_path_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 0a7cd2fa47b..5896b5a9f77 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -163,10 +163,10 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, RecordTick(stats_, WRITE_WITH_WAL); } - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, immutable_db_options_.stats, + DB_WRITE); write_thread_.JoinBatchGroup(&w); - Status status; if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { // we are a non-leader in a parallel group @@ -196,8 +196,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } assert(w.state == WriteThread::STATE_COMPLETED); // STATE_COMPLETED conditional below handles exit - - status = w.FinalStatus(); } if (w.state == WriteThread::STATE_COMPLETED) { if (log_used != nullptr) { @@ -207,13 +205,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, *seq_used = w.sequence; } // write is complete and leader has updated sequence - // Should we handle it? - status.PermitUncheckedError(); return w.FinalStatus(); } // else we are the leader of the write batch group assert(w.state == WriteThread::STATE_GROUP_LEADER); - + Status status; // Once reaches this point, the current writer "w" will try to do its write // job. It may also pick up some of the remaining writers in the "writers_" // when it finds suitable, and finish them in the same write batch. @@ -429,7 +425,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (need_log_sync) { mutex_.Lock(); - MarkLogsSynced(logfile_number_, need_log_dir_sync, status); + if (status.ok()) { + status = MarkLogsSynced(logfile_number_, need_log_dir_sync); + } else { + MarkLogsNotSynced(logfile_number_); + } mutex_.Unlock(); // Requesting sync with two_write_queues_ is expected to be very rare. We // hence provide a simple implementation that is not necessarily efficient. 
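The need_log_sync / MarkLogsSynced path above is driven by WriteOptions::sync. A minimal sketch of a write that requests a WAL sync (illustrative only):

    #include <cassert>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::DB* db = nullptr;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/sync_write_demo", &db);
      assert(s.ok());

      rocksdb::WriteOptions wo;
      // Requires the WAL to be synced before the write is acknowledged; with
      // the change above, a failed sync now marks the logs as not synced.
      wo.sync = true;
      s = db->Put(wo, "key", "value");
      assert(s.ok());
      delete db;
      return 0;
    }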
@@ -469,7 +469,8 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, uint64_t* log_used, uint64_t log_ref, bool disable_memtable, uint64_t* seq_used) { PERF_TIMER_GUARD(write_pre_and_post_process_time); - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, immutable_db_options_.stats, + DB_WRITE); WriteContext write_context; @@ -530,6 +531,8 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, PERF_TIMER_STOP(write_pre_and_post_process_time); IOStatus io_s; + io_s.PermitUncheckedError(); // Allow io_s to be uninitialized + if (w.status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1); @@ -554,14 +557,23 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, if (need_log_sync) { mutex_.Lock(); - MarkLogsSynced(logfile_number_, need_log_dir_sync, w.status); + if (w.status.ok()) { + w.status = MarkLogsSynced(logfile_number_, need_log_dir_sync); + } else { + MarkLogsNotSynced(logfile_number_); + } mutex_.Unlock(); } write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status); } + // NOTE: the memtable_write_group is declared before the following + // `if` statement because its lifetime needs to be longer + // that the inner context of the `if` as a reference to it + // may be used further below within the outer _write_thread WriteThread::WriteGroup memtable_write_group; + if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) { PERF_TIMER_GUARD(write_memtable_time); assert(w.ShouldWriteToMemtable()); @@ -578,6 +590,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, versions_->SetLastSequence(memtable_write_group.last_sequence); write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); } + } else { + // NOTE: the memtable_write_group is never really used, + // so we need to set its status to pass ASSERT_STATUS_CHECKED + memtable_write_group.status.PermitUncheckedError(); } if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { @@ -610,7 +626,8 @@ Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options, SequenceNumber seq, const size_t sub_batch_cnt) { PERF_TIMER_GUARD(write_pre_and_post_process_time); - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, immutable_db_options_.stats, + DB_WRITE); WriteThread::Writer w(write_options, my_batch, callback, log_ref, false /*disable_memtable*/); @@ -661,12 +678,12 @@ Status DBImpl::WriteImplWALOnly( const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt, PreReleaseCallback* pre_release_callback, const AssignOrder assign_order, const PublishLastSeq publish_last_seq, const bool disable_memtable) { - Status status; PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, disable_memtable, sub_batch_cnt, pre_release_callback); RecordTick(stats_, WRITE_WITH_WAL); - StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); + StopWatch write_sw(immutable_db_options_.clock, immutable_db_options_.stats, + DB_WRITE); write_thread->JoinBatchGroup(&w); assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER); @@ -683,6 +700,8 @@ Status DBImpl::WriteImplWALOnly( assert(w.state == WriteThread::STATE_GROUP_LEADER); if (publish_last_seq == kDoPublishLastSeq) { + Status status; + // Currently we only use kDoPublishLastSeq in 
unordered_write assert(immutable_db_options_.unordered_write); WriteContext write_context; @@ -759,7 +778,9 @@ Status DBImpl::WriteImplWALOnly( } seq_inc = total_batch_cnt; } + Status status; IOStatus io_s; + io_s.PermitUncheckedError(); // Allow io_s to be uninitialized if (!write_options.disableWAL) { io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc); status = io_s; @@ -845,8 +866,7 @@ void DBImpl::WriteStatusCheckOnLocked(const Status& status) { if (immutable_db_options_.paranoid_checks && !status.ok() && !status.IsBusy() && !status.IsIncomplete()) { // Maybe change the return status to void? - error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback) - .PermitUncheckedError(); + error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); } } @@ -858,8 +878,7 @@ void DBImpl::WriteStatusCheck(const Status& status) { !status.IsBusy() && !status.IsIncomplete()) { mutex_.Lock(); // Maybe change the return status to void? - error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback) - .PermitUncheckedError(); + error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); mutex_.Unlock(); } } @@ -872,8 +891,7 @@ void DBImpl::IOStatusCheck(const IOStatus& io_status) { io_status.IsIOFenced()) { mutex_.Lock(); // Maybe change the return status to void? - error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback) - .PermitUncheckedError(); + error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback); mutex_.Unlock(); } } @@ -922,7 +940,7 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, // be flushed. We may end up with flushing much more DBs than needed. It's // suboptimal but still correct. WaitForPendingWrites(); - status = HandleWriteBufferFull(write_context); + status = HandleWriteBufferManagerFlush(write_context); } if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) { @@ -949,6 +967,20 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, PERF_TIMER_START(write_pre_and_post_process_time); } + // If memory usage exceeded beyond a certain threshold, + // write_buffer_manager_->ShouldStall() returns true to all threads writing to + // all DBs and writers will be stalled. + // It does soft checking because WriteBufferManager::buffer_limit_ has already + // exceeded at this point so no new write (including current one) will go + // through until memory usage is decreased. + if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) { + if (write_options.no_slowdown) { + status = Status::Incomplete("Write stall"); + } else { + WriteBufferManagerStallWrites(); + } + } + if (status.ok() && *need_log_sync) { // Wait until the parallel syncs are finished. 
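The ShouldStall() check above only takes effect when a WriteBufferManager is installed with stalling enabled. A short sketch of that setup; the allow_stall constructor argument is an assumption here, not something this patch introduces:

    #include <cassert>
    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/write_buffer_manager.h"

    int main() {
      // Shared across DBs; when total memtable memory exceeds the limit and
      // stalling is enabled, writers block (see WriteBufferManagerStallWrites
      // below) until memory usage drops. Assumes an allow_stall-style flag.
      auto wbm = std::make_shared<rocksdb::WriteBufferManager>(
          64 << 20 /* buffer_size */, nullptr /* cache */,
          true /* allow_stall */);

      rocksdb::Options options;
      options.create_if_missing = true;
      options.write_buffer_manager = wbm;

      rocksdb::DB* db = nullptr;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/wbm_stall_demo", &db);
      assert(s.ok());
      delete db;
      return 0;
    }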
Any sync process has to sync // the front log too so it is enough to check the status of front() @@ -1082,7 +1114,7 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, } if (io_s.ok() && need_log_sync) { - StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); + StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS); // It's safe to access logs_ with unlocked mutex_ here because: // - we've set getting_synced=true for all logs, // so other threads won't pop from logs_ while we're here, @@ -1330,16 +1362,23 @@ Status DBImpl::SwitchWAL(WriteContext* write_context) { } for (auto cfd : cfds) { cfd->imm()->FlushRequested(); + if (!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWalFull); + } + } + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWalFull); } - FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); MaybeScheduleFlushOrCompaction(); } return status; } -Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { +Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) { mutex_.AssertHeld(); assert(write_context != nullptr); Status status; @@ -1351,7 +1390,7 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { // suboptimal but still correct. ROCKS_LOG_INFO( immutable_db_options_.info_log, - "Flushing column family with oldest memtable entry. Write buffer is " + "Flushing column family with oldest memtable entry. Write buffers are " "using %" ROCKSDB_PRIszt " bytes out of a total of %" ROCKSDB_PRIszt ".", write_buffer_manager_->memory_usage(), write_buffer_manager_->buffer_size()); @@ -1409,10 +1448,17 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { } for (const auto cfd : cfds) { cfd->imm()->FlushRequested(); + if (!immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + } + } + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); } - FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); MaybeScheduleFlushOrCompaction(); } return status; @@ -1432,8 +1478,10 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, uint64_t time_delayed = 0; bool delayed = false; { - StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed); - uint64_t delay = write_controller_.GetDelay(env_, num_bytes); + StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL, + &time_delayed); + uint64_t delay = + write_controller_.GetDelay(immutable_db_options_.clock, num_bytes); if (delay > 0) { if (write_options.no_slowdown) { return Status::Incomplete("Write stall"); @@ -1445,19 +1493,21 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, write_thread_.BeginWriteStall(); TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); mutex_.Unlock(); - // We will delay the write until we have slept for delay ms or - // we don't need a delay anymore - const uint64_t kDelayInterval = 1000; + // We will delay the write until we have slept for `delay` microseconds + // or we don't need a delay anymore. 
We check for cancellation every 1ms + // (slightly longer because WriteController minimum delay is 1ms, in + // case of sleep imprecision, rounding, etc.) + const uint64_t kDelayInterval = 1001; uint64_t stall_end = sw.start_time() + delay; while (write_controller_.NeedsDelay()) { - if (env_->NowMicros() >= stall_end) { + if (immutable_db_options_.clock->NowMicros() >= stall_end) { // We already delayed this write `delay` microseconds break; } delayed = true; // Sleep for 0.001 seconds - env_->SleepForMicroseconds(kDelayInterval); + immutable_db_options_.clock->SleepForMicroseconds(kDelayInterval); } mutex_.Lock(); write_thread_.EndWriteStall(); @@ -1503,6 +1553,29 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, return s; } +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +void DBImpl::WriteBufferManagerStallWrites() { + mutex_.AssertHeld(); + // First block future writer threads who want to add themselves to the queue + // of WriteThread. + write_thread_.BeginWriteStall(); + mutex_.Unlock(); + + // Change the state to State::Blocked. + static_cast(wbm_stall_.get()) + ->SetState(WBMStallInterface::State::BLOCKED); + // Then WriteBufferManager will add DB instance to its queue + // and block this thread by calling WBMStallInterface::Block(). + write_buffer_manager_->BeginWriteStall(wbm_stall_.get()); + wbm_stall_->Block(); + + mutex_.Lock(); + // Stall has ended. Signal writer threads so that they can add + // themselves to the WriteThread queue for writes. + write_thread_.EndWriteStall(); +} + Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, WriteBatch* my_batch) { assert(write_options.low_pri); @@ -1636,10 +1709,16 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { if (status.ok()) { if (immutable_db_options_.atomic_flush) { AssignAtomicFlushSeq(cfds); + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + } else { + for (auto* cfd : cfds) { + FlushRequest flush_req; + GenerateFlushRequest({cfd}, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + } } - FlushRequest flush_req; - GenerateFlushRequest(cfds, &flush_req); - SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); MaybeScheduleFlushOrCompaction(); } return status; @@ -1785,30 +1864,74 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { } // We may have lost data from the WritableFileBuffer in-memory buffer for // the current log, so treat it as a fatal error and set bg_error - // Should handle return error? if (!io_s.ok()) { - error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable) - .PermitUncheckedError(); + error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable); } else { - error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable) - .PermitUncheckedError(); + error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable); } // Read back bg_error in order to get the right severity s = error_handler_.GetBGError(); return s; } - for (auto loop_cfd : *versions_->GetColumnFamilySet()) { - // all this is just optimization to delete logs that - // are no longer needed -- if CF is empty, that means it - // doesn't need that particular log to stay alive, so we just - // advance the log number. 
no need to persist this in the manifest - if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 && - loop_cfd->imm()->NumNotFlushed() == 0) { - if (creating_new_log) { - loop_cfd->SetLogNumber(logfile_number_); + bool empty_cf_updated = false; + if (immutable_db_options_.track_and_verify_wals_in_manifest && + !immutable_db_options_.allow_2pc && creating_new_log) { + // In non-2pc mode, WALs become obsolete if they do not contain unflushed + // data. Updating the empty CF's log number might cause some WALs to become + // obsolete. So we should track the WAL obsoletion event before actually + // updating the empty CF's log number. + uint64_t min_wal_number_to_keep = + versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_); + if (min_wal_number_to_keep > + versions_->GetWalSet().GetMinWalNumberToKeep()) { + // Get a snapshot of the empty column families. + // LogAndApply may release and reacquire db + // mutex, during that period, column family may become empty (e.g. its + // flush succeeds), then it affects the computed min_log_number_to_keep, + // so we take a snapshot for consistency of column family data + // status. If a column family becomes non-empty afterwards, its active log + // should still be the created new log, so the min_log_number_to_keep is + // not affected. + autovector empty_cfs; + for (auto cf : *versions_->GetColumnFamilySet()) { + if (cf->IsEmpty()) { + empty_cfs.push_back(cf); + } + } + + VersionEdit wal_deletion; + wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); + s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_); + if (!s.ok() && versions_->io_status().IsIOError()) { + s = error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + if (!s.ok()) { + return s; + } + + for (auto cf : empty_cfs) { + if (cf->IsEmpty()) { + cf->SetLogNumber(logfile_number_); + cf->mem()->SetCreationSeq(versions_->LastSequence()); + } // cf may become non-empty. + } + empty_cf_updated = true; + } + } + if (!empty_cf_updated) { + for (auto cf : *versions_->GetColumnFamilySet()) { + // all this is just optimization to delete logs that + // are no longer needed -- if CF is empty, that means it + // doesn't need that particular log to stay alive, so we just + // advance the log number. 
no need to persist this in the manifest + if (cf->IsEmpty()) { + if (creating_new_log) { + cf->SetLogNumber(logfile_number_); + } + cf->mem()->SetCreationSeq(versions_->LastSequence()); } - loop_cfd->mem()->SetCreationSeq(versions_->LastSequence()); } } diff --git a/db/db_info_dumper.cc b/db/db_info_dumper.cc index 207e85faa2b..55c8bb95aa4 100644 --- a/db/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -62,11 +62,12 @@ void DumpDBFileSummary(const ImmutableDBOptions& options, dbname.c_str(), file.c_str()); } break; - case kLogFile: + case kWalFile: if (env->GetFileSize(dbname + "/" + file, &file_size).ok()) { - char str[16]; - snprintf(str, sizeof(str), "%" PRIu64, file_size); - wal_info.append(file).append(" size: ").append(str).append(" ; "); + wal_info.append(file) + .append(" size: ") + .append(std::to_string(file_size)) + .append(" ; "); } else { Error(options.info_log, "Error when reading LOG file: %s/%s\n", dbname.c_str(), file.c_str()); @@ -118,11 +119,12 @@ void DumpDBFileSummary(const ImmutableDBOptions& options, wal_info.clear(); for (const std::string& file : files) { if (ParseFileName(file, &number, &type)) { - if (type == kLogFile) { + if (type == kWalFile) { if (env->GetFileSize(options.wal_dir + "/" + file, &file_size).ok()) { - char str[16]; - snprintf(str, sizeof(str), "%" PRIu64, file_size); - wal_info.append(file).append(" size: ").append(str).append(" ; "); + wal_info.append(file) + .append(" size: ") + .append(std::to_string(file_size)) + .append(" ; "); } else { Error(options.info_log, "Error when reading LOG file %s/%s\n", options.wal_dir.c_str(), file.c_str()); diff --git a/db/db_io_failure_test.cc b/db/db_io_failure_test.cc index 1fcaa6904ee..232ae649ccb 100644 --- a/db/db_io_failure_test.cc +++ b/db/db_io_failure_test.cc @@ -43,11 +43,15 @@ TEST_F(DBIOFailureTest, DropWrites) { if (level > 0 && level == dbfull()->NumberLevels() - 1) { break; } - dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, - true /* disallow trivial move */); + Status s = + dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr, + true /* disallow trivial move */); + ASSERT_TRUE(s.ok() || s.IsCorruption()); } } else { - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + Status s = + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.ok() || s.IsCorruption()); } } @@ -56,7 +60,8 @@ TEST_F(DBIOFailureTest, DropWrites) { ASSERT_EQ("5", property_value); env_->drop_writes_.store(false, std::memory_order_release); - ASSERT_LT(CountFiles(), num_files + 3); + const size_t count = CountFiles(); + ASSERT_LT(count, num_files + 3); // Check that compaction attempts slept after errors // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler @@ -82,7 +87,8 @@ TEST_F(DBIOFailureTest, DropWritesFlush) { ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("0", property_value); - dbfull()->TEST_FlushMemTable(true); + // ASSERT file is too short + ASSERT_TRUE(dbfull()->TEST_FlushMemTable(true).IsCorruption()); ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("1", property_value); @@ -166,7 +172,7 @@ TEST_F(DBIOFailureTest, ManifestWriteError) { ASSERT_EQ("bar", Get("foo")); // Memtable compaction (will succeed) - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ("bar", Get("foo")); const int last = 2; MoveFilesToLevel(2); @@ -174,7 +180,8 @@ TEST_F(DBIOFailureTest, ManifestWriteError) { // Merging compaction (will fail) error_type->store(true, 
std::memory_order_release); - dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + ASSERT_NOK( + dbfull()->TEST_CompactRange(last, nullptr, nullptr)); // Should fail ASSERT_EQ("bar", Get("foo")); error_type->store(false, std::memory_order_release); @@ -192,7 +199,13 @@ TEST_F(DBIOFailureTest, ManifestWriteError) { // Merging compaction (will fail) error_type->store(true, std::memory_order_release); - dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + Status s = + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + if (iter == 0) { + ASSERT_OK(s); + } else { + ASSERT_TRUE(s.IsIOError()); + } ASSERT_EQ("bar", Get("foo")); // Recovery: should not lose data @@ -220,18 +233,15 @@ TEST_F(DBIOFailureTest, PutFailsParanoid) { options.paranoid_checks = true; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error env_->log_write_error_.store(true, std::memory_order_release); - s = Put(1, "foo2", "bar2"); - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(Put(1, "foo2", "bar2")); env_->log_write_error_.store(false, std::memory_order_release); - s = Put(1, "foo3", "bar3"); // the next put should fail, too - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(Put(1, "foo3", "bar3")); // but we're still able to read ASSERT_EQ("bar", Get(1, "foo")); @@ -244,12 +254,10 @@ TEST_F(DBIOFailureTest, PutFailsParanoid) { ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error env_->log_write_error_.store(true, std::memory_order_release); - s = Put(1, "foo2", "bar2"); - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(Put(1, "foo2", "bar2")); env_->log_write_error_.store(false, std::memory_order_release); - s = Put(1, "foo3", "bar3"); // the next put should NOT fail - ASSERT_TRUE(s.ok()); + ASSERT_OK(Put(1, "foo3", "bar3")); } #if !(defined NDEBUG) || !defined(OS_WIN) TEST_F(DBIOFailureTest, FlushSstRangeSyncError) { @@ -269,14 +277,14 @@ TEST_F(DBIOFailureTest, FlushSstRangeSyncError) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; + const char* io_error_msg = "range sync dummy error"; std::atomic range_sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::RangeSync", [&](void* arg) { if (range_sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("range sync dummy error"); + *st = Status::IOError(io_error_msg); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -298,7 +306,9 @@ TEST_F(DBIOFailureTest, FlushSstRangeSyncError) { ASSERT_OK(Put(1, "foo3_2", rnd_str)); ASSERT_OK(Put(1, "foo3_3", rnd_str)); ASSERT_OK(Put(1, "foo4", "bar")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. 
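The flush and compaction failure tests here all follow the same SyncPoint fault-injection pattern: a callback registered on a SpecialEnv callsite overwrites the Status* argument exactly once, and the injected error is then expected to surface from the background job rather than from the foreground write. A minimal sketch of that pattern (callsite name taken from the tests, error text illustrative):

  const char* io_error_msg = "injected dummy error";  // illustrative message
  std::atomic<int> injected(0);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
        if (injected.fetch_add(1) == 0) {
          // The callsite hands us a Status*; overwriting it simulates an IO error.
          *static_cast<Status*>(arg) = Status::IOError(io_error_msg);
        }
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  // The failure comes back from the background flush, not from the Put calls:
  Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
  ASSERT_TRUE(s.IsIOError());
  ASSERT_STREQ(s.getState(), io_error_msg);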
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -328,7 +338,6 @@ TEST_F(DBIOFailureTest, CompactSstRangeSyncError) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; Random rnd(301); std::string rnd_str = @@ -342,21 +351,22 @@ TEST_F(DBIOFailureTest, CompactSstRangeSyncError) { ASSERT_OK(Put(1, "foo1_1", rnd_str)); ASSERT_OK(Put(1, "foo1_2", rnd_str)); ASSERT_OK(Put(1, "foo1_3", rnd_str)); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo3_1", rnd_str)); ASSERT_OK(Put(1, "foo3_2", rnd_str)); ASSERT_OK(Put(1, "foo3_3", rnd_str)); ASSERT_OK(Put(1, "foo4", "bar")); - Flush(1); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + const char* io_error_msg = "range sync dummy error"; std::atomic range_sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::RangeSync", [&](void* arg) { if (range_sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("range sync dummy error"); + *st = Status::IOError(io_error_msg); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -365,7 +375,9 @@ TEST_F(DBIOFailureTest, CompactSstRangeSyncError) { { {"disable_auto_compactions", "false"}, })); - dbfull()->TEST_WaitForCompact(); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -389,13 +401,14 @@ TEST_F(DBIOFailureTest, FlushSstCloseError) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; + + const char* io_error_msg = "close dummy error"; std::atomic close_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Close", [&](void* arg) { if (close_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("close dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -404,7 +417,9 @@ TEST_F(DBIOFailureTest, FlushSstCloseError) { ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); ASSERT_OK(Put(1, "foo", "bar2")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. 
ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -429,25 +444,25 @@ TEST_F(DBIOFailureTest, CompactionSstCloseError) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar2")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar3")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + const char* io_error_msg = "close dummy error"; std::atomic close_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Close", [&](void* arg) { if (close_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("close dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -456,7 +471,9 @@ TEST_F(DBIOFailureTest, CompactionSstCloseError) { { {"disable_auto_compactions", "false"}, })); - dbfull()->TEST_WaitForCompact(); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as compaction failed. ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -480,13 +497,14 @@ TEST_F(DBIOFailureTest, FlushSstSyncError) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; + + const char* io_error_msg = "sync dummy error"; std::atomic sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Sync", [&](void* arg) { if (sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("sync dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -495,7 +513,9 @@ TEST_F(DBIOFailureTest, FlushSstSyncError) { ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); ASSERT_OK(Put(1, "foo", "bar2")); - dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as flush failed. ASSERT_NOK(Put(1, "foo2", "bar3")); @@ -521,25 +541,25 @@ TEST_F(DBIOFailureTest, CompactionSstSyncError) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar2")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "foo", "bar3")); ASSERT_OK(Put(1, "foo2", "bar")); - Flush(1); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + const char* io_error_msg = "sync dummy error"; std::atomic sync_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SpecialEnv::SStableFile::Sync", [&](void* arg) { if (sync_called.fetch_add(1) == 0) { Status* st = static_cast(arg); - *st = Status::IOError("close dummy error"); + *st = Status::IOError(io_error_msg); } }); @@ -548,7 +568,9 @@ TEST_F(DBIOFailureTest, CompactionSstSyncError) { { {"disable_auto_compactions", "false"}, })); - dbfull()->TEST_WaitForCompact(); + Status s = dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(s.IsIOError()); + ASSERT_STREQ(s.getState(), io_error_msg); // Following writes should fail as compaction failed. 
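After an injected flush or compaction failure the DB stays in a background-error state, which is why every subsequent Put in these tests is expected to fail until the error is cleared. The tests observe this both through ASSERT_NOK on the writes and through the background-error counter property; a minimal sketch of that check, assuming exactly one failure has been injected:

  std::string property_value;
  // Counts background errors recorded so far; "1" assumes a single injected
  // failure, matching the flush-failure test above.
  ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
  ASSERT_EQ("1", property_value);

  // While the background error is set, foreground writes are rejected.
  ASSERT_NOK(Put(1, "foo2", "bar3"));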
ASSERT_NOK(Put(1, "foo2", "bar3")); diff --git a/db/db_iter.cc b/db/db_iter.cc index a9eee88dd92..75a196e4dd2 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -8,9 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/db_iter.h" -#include + #include #include +#include #include "db/dbformat.h" #include "db/merge_context.h" @@ -24,6 +25,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" +#include "rocksdb/system_clock.h" #include "table/internal_iterator.h" #include "table/iterator_wrapper.h" #include "trace_replay/trace_replay.h" @@ -34,21 +36,24 @@ namespace ROCKSDB_NAMESPACE { DBIter::DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, - const Comparator* cmp, InternalIterator* iter, SequenceNumber s, - bool arena_mode, uint64_t max_sequential_skip_in_iterations, + const Comparator* cmp, InternalIterator* iter, + const Version* version, SequenceNumber s, bool arena_mode, + uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob) + ColumnFamilyData* cfd, bool expose_blob_index) : prefix_extractor_(mutable_cf_options.prefix_extractor.get()), env_(_env), - logger_(cf_options.info_log), + clock_(ioptions.clock), + logger_(ioptions.logger), user_comparator_(cmp), - merge_operator_(cf_options.merge_operator), + merge_operator_(ioptions.merge_operator.get()), iter_(iter), + version_(version), read_callback_(read_callback), sequence_(s), - statistics_(cf_options.statistics), + statistics_(ioptions.stats), max_skip_(max_sequential_skip_in_iterations), max_skippable_internal_keys_(read_options.max_skippable_internal_keys), num_internal_keys_skipped_(0), @@ -65,10 +70,12 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, expect_total_order_inner_iter_(prefix_extractor_ == nullptr || read_options.total_order_seek || read_options.auto_prefix_mode), - allow_blob_(allow_blob), + read_tier_(read_options.read_tier), + verify_checksums_(read_options.verify_checksums), + expose_blob_index_(expose_blob_index), is_blob_(false), arena_mode_(arena_mode), - range_del_agg_(&cf_options.internal_comparator, s), + range_del_agg_(&ioptions.internal_comparator, s), db_impl_(db_impl), cfd_(cfd), start_seqnum_(read_options.iter_start_seqnum), @@ -107,11 +114,11 @@ Status DBIter::GetProperty(std::string prop_name, std::string* prop) { } bool DBIter::ParseKey(ParsedInternalKey* ikey) { - if (ParseInternalKey(iter_.key(), ikey) != Status::OK()) { - status_ = Status::Corruption("corrupted internal key in DBIter"); + Status s = ParseInternalKey(iter_.key(), ikey, false /* log_err_key */); + if (!s.ok()) { + status_ = Status::Corruption("In DBIter: ", s.getState()); valid_ = false; - ROCKS_LOG_ERROR(logger_, "corrupted internal key in DBIter: %s", - iter_.key().ToString(true).c_str()); + ROCKS_LOG_ERROR(logger_, "In DBIter: %s", status_.getState()); return false; } else { return true; @@ -122,7 +129,7 @@ void DBIter::Next() { assert(valid_); assert(status_.ok()); - PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_); // Release temporarily pinned blocks from last operation ReleaseTempPinnedData(); local_stats_.skip_count_ += num_internal_keys_skipped_; @@ -164,6 +171,42 @@ void DBIter::Next() { } } +bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, + const Slice& blob_index) 
{ + assert(!is_blob_); + + if (expose_blob_index_) { // Stacked BlobDB implementation + is_blob_ = true; + return true; + } + + if (!version_) { + status_ = Status::Corruption("Encountered unexpected blob index."); + valid_ = false; + return false; + } + + // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to + // avoid having to copy options back and forth. + ReadOptions read_options; + read_options.read_tier = read_tier_; + read_options.verify_checksums = verify_checksums_; + + constexpr uint64_t* bytes_read = nullptr; + + const Status s = version_->GetBlob(read_options, user_key, blob_index, + &blob_value_, bytes_read); + + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + is_blob_ = true; + return true; +} + // PRE: saved_key_ has the current user key if skipping_saved_key // POST: saved_key_ should have the next user key if valid_, // if the current entry is a result of merge @@ -220,25 +263,28 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, is_key_seqnum_zero_ = false; return false; } + Slice user_key_without_ts = + StripTimestampFromUserKey(ikey_.user_key, timestamp_size_); is_key_seqnum_zero_ = (ikey_.sequence == 0); assert(iterate_upper_bound_ == nullptr || iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound || user_comparator_.CompareWithoutTimestamp( - ikey_.user_key, /*a_has_ts=*/true, *iterate_upper_bound_, + user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_, /*b_has_ts=*/false) < 0); if (iterate_upper_bound_ != nullptr && iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound && user_comparator_.CompareWithoutTimestamp( - ikey_.user_key, /*a_has_ts=*/true, *iterate_upper_bound_, + user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_, /*b_has_ts=*/false) >= 0) { break; } assert(prefix == nullptr || prefix_extractor_ != nullptr); if (prefix != nullptr && - prefix_extractor_->Transform(ikey_.user_key).compare(*prefix) != 0) { + prefix_extractor_->Transform(user_key_without_ts).compare(*prefix) != + 0) { assert(prefix_same_as_start_); break; } @@ -315,8 +361,14 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, case kTypeBlobIndex: if (start_seqnum_ > 0) { if (ikey_.sequence >= start_seqnum_) { - assert(ikey_.type != kTypeBlobIndex); saved_key_.SetInternalKey(ikey_); + + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } + } + valid_ = true; return true; } else { @@ -330,6 +382,13 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, } } else if (timestamp_lb_) { saved_key_.SetInternalKey(ikey_); + + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } + } + valid_ = true; return true; } else { @@ -344,20 +403,13 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, num_skipped = 0; reseek_done = false; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - } else if (ikey_.type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. 
Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - valid_ = false; - return false; + } else { + if (ikey_.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + return false; + } } - is_blob_ = true; - valid_ = true; - return true; - } else { valid_ = true; return true; } @@ -436,11 +488,11 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, &last_key, ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion)); } else { - std::string min_ts(timestamp_size_, static_cast(0)); + const std::string kTsMin(timestamp_size_, '\0'); AppendInternalKeyWithDifferentTimestamp( &last_key, ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion), - min_ts); + kTsMin); } // Don't set skipping_saved_key = false because we may still see more // user-keys equal to saved_key_. @@ -497,7 +549,6 @@ bool DBIter::MergeValuesNewToOld() { TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand"); ParsedInternalKey ikey; - Status s; for (iter_.Next(); iter_.Valid(); iter_.Next()) { TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand"); if (!ParseKey(&ikey)) { @@ -525,12 +576,8 @@ bool DBIter::MergeValuesNewToOld() { // hit a put, merge the put value with operands and store the // final result in saved_value_. We are done! const Slice val = iter_.value(); - s = MergeHelper::TimedFullMerge( - merge_operator_, ikey.user_key, &val, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, env_, &pinned_value_, true); + Status s = Merge(&val, ikey.user_key); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } // iter_ is positioned after put @@ -547,17 +594,31 @@ bool DBIter::MergeValuesNewToOld() { iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); PERF_COUNTER_ADD(internal_merge_count, 1); } else if (kTypeBlobIndex == ikey.type) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - } else { + if (expose_blob_index_) { status_ = - Status::NotSupported("Blob DB does not support merge operator."); + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; } - valid_ = false; - return false; + // hit a put, merge the put value with operands and store the + // final result in saved_value_. We are done! + if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) { + return false; + } + valid_ = true; + const Slice blob_value = value(); + Status s = Merge(&blob_value, ikey.user_key); + if (!s.ok()) { + return false; + } + is_blob_ = false; + // iter_ is positioned after put + iter_.Next(); + if (!iter_.status().ok()) { + valid_ = false; + return false; + } + return true; } else { valid_ = false; status_ = Status::Corruption( @@ -576,32 +637,19 @@ bool DBIter::MergeValuesNewToOld() { // a deletion marker. // feed null as the existing value to the merge operator, such that // client can differentiate this scenario and do things accordingly. 
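With the integrated BlobDB path added here, a kTypeBlobIndex entry is resolved through the current Version, so Iterator::value() already returns the user value rather than the blob index; only the stacked BlobDB (expose_blob_index == true) still surfaces the raw index. A minimal usage-level sketch under the assumption that the DB was opened with blob files enabled (the option name is illustrative of the integrated path, not something this diff touches):

  // Assumes a DB opened with blob storage enabled for the column family
  // (e.g. enable_blob_files = true); keys and values are illustrative.
  std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // value() is already the user value; any blob index was resolved
    // internally via Version::GetBlob().
    Slice user_value = it->value();
    (void)user_value;
  }
  assert(it->status().ok());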
- s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(), - nullptr, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, env_, - &pinned_value_, true); + Status s = Merge(nullptr, saved_key_.GetUserKey()); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } - assert(status_.ok()); return true; } void DBIter::Prev() { - if (timestamp_size_ > 0) { - valid_ = false; - status_ = Status::NotSupported( - "SeekToLast/SeekForPrev/Prev currently not supported with timestamp."); - return; - } - assert(valid_); assert(status_.ok()); - PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); bool ok = true; @@ -636,9 +684,16 @@ bool DBIter::ReverseToForward() { // If that's the case, seek iter_ to current key. if (!expect_total_order_inner_iter() || !iter_.Valid()) { IterKey last_key; - last_key.SetInternalKey(ParsedInternalKey( - saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber, + kValueTypeForSeek); + if (timestamp_size_ > 0) { + // TODO: pre-create kTsMax. + const std::string kTsMax(timestamp_size_, '\xff'); + pikey.SetTimestamp(kTsMax); + } + last_key.SetInternalKey(pikey); iter_.Seek(last_key.GetInternalKey()); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); } direction_ = kForward; @@ -689,6 +744,7 @@ bool DBIter::ReverseToBackward() { iter_.SeekToLast(); } } + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); } direction_ = kReverse; @@ -703,7 +759,9 @@ void DBIter::PrevInternal(const Slice* prefix) { assert(prefix == nullptr || prefix_extractor_ != nullptr); if (prefix != nullptr && - prefix_extractor_->Transform(saved_key_.GetUserKey()) + prefix_extractor_ + ->Transform(StripTimestampFromUserKey(saved_key_.GetUserKey(), + timestamp_size_)) .compare(*prefix) != 0) { assert(prefix_same_as_start_); // Current key does not have the same prefix as start @@ -712,11 +770,13 @@ void DBIter::PrevInternal(const Slice* prefix) { } assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() || - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_lower_bound_) >= 0); + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, + *iterate_lower_bound_, /*b_has_ts=*/false) >= 0); if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() && - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_lower_bound_) < 0) { + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_, + /*b_has_ts=*/false) < 0) { // We've iterated earlier than the user-specified lower bound. 
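The reseek paths above now build their internal seek keys with explicit timestamp sentinels: within one user key, entries sort newest-first, so the all-0xff timestamp combined with kMaxSequenceNumber positions the inner iterator at the newest entry, while the all-zero timestamp with sequence 0 marks the oldest possible position. A minimal sketch of both constructions, assuming a fixed timestamp_size_ and the default newest-first ordering (user_key is illustrative):

  const std::string kTsMax(timestamp_size_, '\xff');  // newest possible timestamp
  const std::string kTsMin(timestamp_size_, '\0');    // oldest possible timestamp

  // Seek target for the newest visible entry of user_key.
  ParsedInternalKey newest(user_key, kMaxSequenceNumber, kValueTypeForSeek);
  if (timestamp_size_ > 0) {
    newest.SetTimestamp(kTsMax);
  }

  // Sentinel for the oldest position of user_key, as used when synthesizing a
  // deletion marker during reseeks.
  std::string oldest_key;
  AppendInternalKeyWithDifferentTimestamp(
      &oldest_key,
      ParsedInternalKey(user_key, 0 /* sequence */, kTypeDeletion), kTsMin);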
valid_ = false; return; @@ -761,8 +821,8 @@ bool DBIter::FindValueForCurrentKey() { assert(iter_.Valid()); merge_context_.Clear(); current_entry_is_merged_ = false; - // last entry before merge (could be kTypeDeletion, kTypeSingleDeletion or - // kTypeValue) + // last entry before merge (could be kTypeDeletion, + // kTypeDeletionWithTimestamp, kTypeSingleDeletion or kTypeValue) ValueType last_not_merge_type = kTypeDeletion; ValueType last_key_entry_type = kTypeDeletion; @@ -783,9 +843,13 @@ bool DBIter::FindValueForCurrentKey() { timestamp_size_); } if (!IsVisible(ikey.sequence, ts) || - !user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + !user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { break; } + if (!ts.empty()) { + saved_timestamp_.assign(ts.data(), ts.size()); + } if (TooManyInternalKeysSkipped()) { return false; } @@ -810,14 +874,22 @@ bool DBIter::FindValueForCurrentKey() { ikey, RangeDelPositioningMode::kBackwardTraversal)) { last_key_entry_type = kTypeRangeDeletion; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - } else { - assert(iter_.iter()->IsValuePinned()); + } else if (iter_.iter()->IsValuePinned()) { pinned_value_ = iter_.value(); + } else { + valid_ = false; + status_ = Status::NotSupported( + "Backward iteration not supported if underlying iterator's value " + "cannot be pinned."); } merge_context_.Clear(); last_not_merge_type = last_key_entry_type; + if (!status_.ok()) { + return false; + } break; case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: merge_context_.Clear(); last_not_merge_type = last_key_entry_type; @@ -861,6 +933,7 @@ bool DBIter::FindValueForCurrentKey() { is_blob_ = false; switch (last_key_entry_type) { case kTypeDeletion: + case kTypeDeletionWithTimestamp: case kTypeSingleDeletion: case kTypeRangeDeletion: valid_ = false; @@ -870,43 +943,45 @@ bool DBIter::FindValueForCurrentKey() { if (last_not_merge_type == kTypeDeletion || last_not_merge_type == kTypeSingleDeletion || last_not_merge_type == kTypeRangeDeletion) { - s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), nullptr, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, - env_, &pinned_value_, true); + s = Merge(nullptr, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + return true; } else if (last_not_merge_type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. 
Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - } else { + if (expose_blob_index_) { status_ = - Status::NotSupported("Blob DB does not support merge operator."); + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; } - valid_ = false; - return false; + if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { + return false; + } + valid_ = true; + const Slice blob_value = value(); + s = Merge(&blob_value, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + is_blob_ = false; + return true; } else { assert(last_not_merge_type == kTypeValue); - s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), &pinned_value_, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, - env_, &pinned_value_, true); + s = Merge(&pinned_value_, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + return true; } break; case kTypeValue: // do nothing - we've already has value in pinned_value_ break; case kTypeBlobIndex: - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - valid_ = false; + if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { return false; } - is_blob_ = true; break; default: valid_ = false; @@ -933,8 +1008,17 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { // FindValueForCurrentKeyUsingSeek() assert(pinned_iters_mgr_.PinningEnabled()); std::string last_key; - AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(), - sequence_, kValueTypeForSeek)); + if (0 == timestamp_size_) { + AppendInternalKey(&last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek)); + } else { + AppendInternalKeyWithDifferentTimestamp( + &last_key, + ParsedInternalKey(saved_key_.GetUserKey(), sequence_, + kValueTypeForSeek), + *timestamp_ub_); + } iter_.Seek(last_key); RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); @@ -958,7 +1042,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { timestamp_size_); } - if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey())) { // No visible values for this key, even though FindValueForCurrentKey() // has seen some. This is possible if we're using a tailing iterator, and // the entries were discarded in a compaction. @@ -975,26 +1060,28 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || range_del_agg_.ShouldDelete( - ikey, RangeDelPositioningMode::kBackwardTraversal)) { + ikey, RangeDelPositioningMode::kBackwardTraversal) || + kTypeDeletionWithTimestamp == ikey.type) { valid_ = false; return true; } - if (ikey.type == kTypeBlobIndex && !allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. 
Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - valid_ = false; - return false; - } if (!iter_.PrepareValue()) { valid_ = false; return false; } + if (timestamp_size_ > 0) { + Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_); + saved_timestamp_.assign(ts.data(), ts.size()); + } if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) { assert(iter_.iter()->IsValuePinned()); pinned_value_ = iter_.value(); - is_blob_ = (ikey.type == kTypeBlobIndex); + if (ikey.type == kTypeBlobIndex) { + if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) { + return false; + } + } + valid_ = true; return true; } @@ -1034,33 +1121,33 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { if (ikey.type == kTypeValue) { const Slice val = iter_.value(); - Status s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), &val, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, - env_, &pinned_value_, true); + Status s = Merge(&val, saved_key_.GetUserKey()); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } - valid_ = true; return true; } else if (ikey.type == kTypeMerge) { merge_context_.PushOperand( iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); PERF_COUNTER_ADD(internal_merge_count, 1); } else if (ikey.type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); - } else { + if (expose_blob_index_) { status_ = - Status::NotSupported("Blob DB does not support merge operator."); + Status::NotSupported("BlobDB does not support merge operator."); + valid_ = false; + return false; } - valid_ = false; - return false; + if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) { + return false; + } + valid_ = true; + const Slice blob_value = value(); + Status s = Merge(&blob_value, saved_key_.GetUserKey()); + if (!s.ok()) { + return false; + } + is_blob_ = false; + return true; } else { valid_ = false; status_ = Status::Corruption( @@ -1070,13 +1157,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { } } - Status s = MergeHelper::TimedFullMerge( - merge_operator_, saved_key_.GetUserKey(), nullptr, - merge_context_.GetOperands(), &saved_value_, logger_, statistics_, env_, - &pinned_value_, true); + Status s = Merge(nullptr, saved_key_.GetUserKey()); if (!s.ok()) { - valid_ = false; - status_ = s; return false; } @@ -1099,6 +1181,19 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return true; } +Status DBIter::Merge(const Slice* val, const Slice& user_key) { + Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key, val, merge_context_.GetOperands(), + &saved_value_, logger_, statistics_, clock_, &pinned_value_, true); + if (!s.ok()) { + valid_ = false; + status_ = s; + return s; + } + valid_ = true; + return s; +} + // Move backwards until the key smaller than saved_key_. // Changes valid_ only if return value is false. 
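The new DBIter::Merge() helper centralizes the MergeHelper::TimedFullMerge call together with the status_ and valid_ bookkeeping that each call site previously repeated, so the merge paths above reduce to the pattern below (a sketch of the call-site shape, not standalone code; val may be nullptr when there is no base value):

  // Typical call-site pattern after the refactor.
  Status s = Merge(&val, saved_key_.GetUserKey());
  if (!s.ok()) {
    // Merge() has already set status_ and cleared valid_.
    return false;
  }
  // On success Merge() sets valid_ = true; the result lives in saved_value_,
  // or in pinned_value_ when the merge result is one of the operands.
  return true;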
bool DBIter::FindUserKeyBeforeSavedKey() { @@ -1110,7 +1205,8 @@ bool DBIter::FindUserKeyBeforeSavedKey() { return false; } - if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) < 0) { + if (user_comparator_.CompareWithoutTimestamp(ikey.user_key, + saved_key_.GetUserKey()) < 0) { return true; } @@ -1134,8 +1230,14 @@ bool DBIter::FindUserKeyBeforeSavedKey() { if (num_skipped >= max_skip_) { num_skipped = 0; IterKey last_key; - last_key.SetInternalKey(ParsedInternalKey( - saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber, + kValueTypeForSeek); + if (timestamp_size_ > 0) { + // TODO: pre-create kTsMax. + const std::string kTsMax(timestamp_size_, '\xff'); + pikey.SetTimestamp(kTsMax); + } + last_key.SetInternalKey(pikey); // It would be more efficient to use SeekForPrev() here, but some // iterators may not support it. iter_.Seek(last_key.GetInternalKey()); @@ -1212,24 +1314,50 @@ void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) { saved_key_.Clear(); // now saved_key is used to store internal key. saved_key_.SetInternalKey(target, 0 /* sequence_number */, - kValueTypeForSeekForPrev); + kValueTypeForSeekForPrev, timestamp_ub_); + + if (timestamp_size_ > 0) { + const std::string kTsMin(timestamp_size_, '\0'); + Slice ts = kTsMin; + saved_key_.UpdateInternalKey(/*seq=*/0, kValueTypeForSeekForPrev, &ts); + } if (iterate_upper_bound_ != nullptr && - user_comparator_.Compare(saved_key_.GetUserKey(), - *iterate_upper_bound_) >= 0) { + user_comparator_.CompareWithoutTimestamp( + saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0) { saved_key_.Clear(); - saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber); + saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber, + kValueTypeForSeekForPrev, timestamp_ub_); + if (timestamp_size_ > 0) { + const std::string kTsMax(timestamp_size_, '\xff'); + Slice ts = kTsMax; + saved_key_.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeekForPrev, + &ts); + } } } void DBIter::Seek(const Slice& target) { - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); - StopWatch sw(env_, statistics_, DB_SEEK); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); + StopWatch sw(clock_, statistics_, DB_SEEK); #ifndef ROCKSDB_LITE if (db_impl_ != nullptr && cfd_ != nullptr) { // TODO: What do we do if this returns an error? - db_impl_->TraceIteratorSeek(cfd_->GetID(), target).PermitUncheckedError(); + Slice lower_bound, upper_bound; + if (iterate_lower_bound_ != nullptr) { + lower_bound = *iterate_lower_bound_; + } else { + lower_bound = Slice(""); + } + if (iterate_upper_bound_ != nullptr) { + upper_bound = *iterate_upper_bound_; + } else { + upper_bound = Slice(""); + } + db_impl_->TraceIteratorSeek(cfd_->GetID(), target, lower_bound, upper_bound) + .PermitUncheckedError(); } #endif // ROCKSDB_LITE @@ -1257,7 +1385,7 @@ void DBIter::Seek(const Slice& target) { // we need to find out the next key that is visible to the user. ClearSavedValue(); if (prefix_same_as_start_) { - // The case where the iterator needs to be invalidated if it has exausted + // The case where the iterator needs to be invalidated if it has exhausted // keys within the same prefix of the seek key. 
assert(prefix_extractor_ != nullptr); Slice target_prefix = prefix_extractor_->Transform(target); @@ -1285,24 +1413,30 @@ void DBIter::Seek(const Slice& target) { } void DBIter::SeekForPrev(const Slice& target) { - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); - StopWatch sw(env_, statistics_, DB_SEEK); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); + StopWatch sw(clock_, statistics_, DB_SEEK); #ifndef ROCKSDB_LITE if (db_impl_ != nullptr && cfd_ != nullptr) { // TODO: What do we do if this returns an error? - db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target) + Slice lower_bound, upper_bound; + if (iterate_lower_bound_ != nullptr) { + lower_bound = *iterate_lower_bound_; + } else { + lower_bound = Slice(""); + } + if (iterate_upper_bound_ != nullptr) { + upper_bound = *iterate_upper_bound_; + } else { + upper_bound = Slice(""); + } + db_impl_ + ->TraceIteratorSeekForPrev(cfd_->GetID(), target, lower_bound, + upper_bound) .PermitUncheckedError(); } #endif // ROCKSDB_LITE - if (timestamp_size_ > 0) { - valid_ = false; - status_ = Status::NotSupported( - "SeekToLast/SeekForPrev/Prev currently not supported with timestamp."); - return; - } - status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); @@ -1326,7 +1460,7 @@ void DBIter::SeekForPrev(const Slice& target) { // backward direction. ClearSavedValue(); if (prefix_same_as_start_) { - // The case where the iterator needs to be invalidated if it has exausted + // The case where the iterator needs to be invalidated if it has exhausted // keys within the same prefix of the seek key. assert(prefix_extractor_ != nullptr); Slice target_prefix = prefix_extractor_->Transform(target); @@ -1353,7 +1487,7 @@ void DBIter::SeekToFirst() { Seek(*iterate_lower_bound_); return; } - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. if (!expect_total_order_inner_iter()) { @@ -1391,29 +1525,25 @@ void DBIter::SeekToFirst() { } if (valid_ && prefix_same_as_start_) { assert(prefix_extractor_ != nullptr); - prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); + prefix_.SetUserKey(prefix_extractor_->Transform( + StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_))); } } void DBIter::SeekToLast() { - if (timestamp_size_ > 0) { - valid_ = false; - status_ = Status::NotSupported( - "SeekToLast/SeekForPrev/Prev currently not supported with timestamp."); - return; - } - if (iterate_upper_bound_ != nullptr) { // Seek to last key strictly less than ReadOptions.iterate_upper_bound. SeekForPrev(*iterate_upper_bound_); - if (Valid() && user_comparator_.Equal(*iterate_upper_bound_, key())) { + if (Valid() && 0 == user_comparator_.CompareWithoutTimestamp( + *iterate_upper_bound_, /*a_has_ts=*/false, key(), + /*b_has_ts=*/false)) { ReleaseTempPinnedData(); PrevInternal(nullptr); } return; } - PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. 
if (!expect_total_order_inner_iter()) { @@ -1442,23 +1572,25 @@ void DBIter::SeekToLast() { } if (valid_ && prefix_same_as_start_) { assert(prefix_extractor_ != nullptr); - prefix_.SetUserKey(prefix_extractor_->Transform(saved_key_.GetUserKey())); + prefix_.SetUserKey(prefix_extractor_->Transform( + StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_))); } } Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* user_key_comparator, - InternalIterator* internal_iter, + InternalIterator* internal_iter, const Version* version, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob) { - DBIter* db_iter = new DBIter( - env, read_options, cf_options, mutable_cf_options, user_key_comparator, - internal_iter, sequence, false, max_sequential_skip_in_iterations, - read_callback, db_impl, cfd, allow_blob); + ColumnFamilyData* cfd, bool expose_blob_index) { + DBIter* db_iter = + new DBIter(env, read_options, ioptions, mutable_cf_options, + user_key_comparator, internal_iter, version, sequence, false, + max_sequential_skip_in_iterations, read_callback, db_impl, cfd, + expose_blob_index); return db_iter; } diff --git a/db/db_iter.h b/db/db_iter.h index 93b78c06ee8..52bffa55d6a 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -8,8 +8,9 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include +#include #include + #include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/range_del_aggregator.h" @@ -21,6 +22,7 @@ #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +class Version; // This file declares the factory functions of DBIter, in its original form // or a wrapped form with class ArenaWrappedDBIter, which is defined here. @@ -66,7 +68,7 @@ class DBIter final : public Iterator { // this->key(). // (2) When moving backwards, the internal iterator is positioned // just before all entries whose user key == this->key(). - enum Direction { kForward, kReverse }; + enum Direction : uint8_t { kForward, kReverse }; // LocalStatistics contain Statistics counters that will be aggregated per // each iterator instance and then will be sent to the global statistics when @@ -112,12 +114,12 @@ class DBIter final : public Iterator { }; DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* cmp, - InternalIterator* iter, SequenceNumber s, bool arena_mode, - uint64_t max_sequential_skip_in_iterations, + InternalIterator* iter, const Version* version, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, - bool allow_blob); + bool expose_blob_index); // No copying allowed DBIter(const DBIter&) = delete; @@ -159,7 +161,10 @@ class DBIter final : public Iterator { } Slice value() const override { assert(valid_); - if (current_entry_is_merged_) { + + if (!expose_blob_index_ && is_blob_) { + return blob_value_; + } else if (current_entry_is_merged_) { // If pinned_value_ is set then the result of merge operator is one of // the merge operands and we should return it. return pinned_value_.data() ? 
pinned_value_ : saved_value_; @@ -180,12 +185,15 @@ class DBIter final : public Iterator { Slice timestamp() const override { assert(valid_); assert(timestamp_size_ > 0); + if (direction_ == kReverse) { + return saved_timestamp_; + } const Slice ukey_and_ts = saved_key_.GetUserKey(); assert(timestamp_size_ < ukey_and_ts.size()); return ExtractTimestampFromUserKey(ukey_and_ts, timestamp_size_); } bool IsBlob() const { - assert(valid_ && (allow_blob_ || !is_blob_)); + assert(valid_); return is_blob_; } @@ -227,7 +235,7 @@ class DBIter final : public Iterator { // If `skipping_saved_key` is true, the function will keep iterating until it // finds a user key that is larger than `saved_key_`. // If `prefix` is not null, the iterator needs to stop when all keys for the - // prefix are exhausted and the interator is set to invalid. + // prefix are exhausted and the iterator is set to invalid. bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix); // Internal implementation of FindNextUserEntry(). bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix); @@ -287,12 +295,20 @@ class DBIter final : public Iterator { : user_comparator_.CompareWithoutTimestamp(a, b); } + // Retrieves the blob value for the specified user key using the given blob + // index when using the integrated BlobDB implementation. + bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index); + + Status Merge(const Slice* val, const Slice& user_key); + const SliceTransform* prefix_extractor_; Env* const env_; + SystemClock* clock_; Logger* logger_; UserComparatorWrapper user_comparator_; const MergeOperator* const merge_operator_; IteratorWrapper iter_; + const Version* version_; ReadCallback* read_callback_; // Max visible sequence number. It is normally the snapshot seq unless we have // uncommitted data in db as in WriteUnCommitted. @@ -306,6 +322,7 @@ class DBIter final : public Iterator { std::string saved_value_; Slice pinned_value_; // for prefix seek mode to support prev() + PinnableSlice blob_value_; Statistics* statistics_; uint64_t max_skip_; uint64_t max_skippable_internal_keys_; @@ -335,7 +352,11 @@ class DBIter final : public Iterator { // Expect the inner iterator to maintain a total order. // prefix_extractor_ must be non-NULL if the value is false. const bool expect_total_order_inner_iter_; - bool allow_blob_; + ReadTier read_tier_; + bool verify_checksums_; + // Whether the iterator is allowed to expose blob references. Set to true when + // the stacked BlobDB implementation is used, false otherwise. + bool expose_blob_index_; bool is_blob_; bool arena_mode_; // List of operands for merge operator. @@ -357,18 +378,19 @@ class DBIter final : public Iterator { const Slice* const timestamp_ub_; const Slice* const timestamp_lb_; const size_t timestamp_size_; + std::string saved_timestamp_; }; // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified `sequence` number // into appropriate user keys. 
extern Iterator* NewDBIterator( - Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, + Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Comparator* user_key_comparator, InternalIterator* internal_iter, - const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - ReadCallback* read_callback, DBImpl* db_impl = nullptr, - ColumnFamilyData* cfd = nullptr, bool allow_blob = false); + const Version* version, const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback, + DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, + bool expose_blob_index = false); } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_iter_stress_test.cc b/db/db_iter_stress_test.cc index 484bb0b45a3..f2b200f68bf 100644 --- a/db/db_iter_stress_test.cc +++ b/db/db_iter_stress_test.cc @@ -511,9 +511,9 @@ TEST_F(DBIteratorStressTest, StressTest) { target_hidden_fraction; internal_iter->trace = trace; db_iter.reset(NewDBIterator( - env_, ropt, ImmutableCFOptions(options), + env_, ropt, ImmutableOptions(options), MutableCFOptions(options), BytewiseComparator(), - internal_iter, sequence, + internal_iter, nullptr /* version */, sequence, options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); } diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 1c9680da267..252b192fede 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -99,10 +99,11 @@ class TestIterator : public InternalIterator { } for (auto it = data_.begin(); it != data_.end(); ++it) { ParsedInternalKey ikey; - Status pikStatus = ParseInternalKey(it->first, &ikey); - pikStatus.PermitUncheckedError(); - assert(pikStatus.ok()); - if (!pikStatus.ok() || ikey.user_key != _key) { + Status pik_status = + ParseInternalKey(it->first, &ikey, true /* log_err_key */); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); + if (!pik_status.ok() || ikey.user_key != _key) { continue; } if (valid_ && data_.begin() + iter_ > it) { @@ -236,7 +237,7 @@ class DBIteratorTest : public testing::Test { TEST_F(DBIteratorTest, DBIteratorPrevNext) { Options options; - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); @@ -251,9 +252,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -284,9 +286,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -311,9 
+314,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -344,9 +348,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -380,12 +385,14 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } // Test case to check SeekToLast with iterate_upper_bound set // (same key put may times - SeekToLast should start with the @@ -410,9 +417,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 7, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 7 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -448,9 +456,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -474,12 +483,14 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } // Test to check the 
SeekToLast() with the iterate_upper_bound set // (Deletion cases) @@ -497,9 +508,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -533,9 +545,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ro.iterate_upper_bound = &prefix; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 7, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 7 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -563,9 +576,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -584,6 +598,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -606,9 +621,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -638,9 +654,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -660,7 +677,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { TEST_F(DBIteratorTest, DBIteratorEmpty) { Options options; - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); ReadOptions ro; @@ -669,11 +686,13 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, 
BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -681,11 +700,13 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -704,9 +725,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 2 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -727,6 +749,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u); } @@ -734,7 +757,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -749,9 +772,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -770,6 +794,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -785,9 +810,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -801,6 +827,7 @@ 
TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -814,9 +841,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 202, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 202 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -835,6 +863,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -847,14 +876,17 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } TestIterator* internal_iter = new TestIterator(BytewiseComparator()); @@ -864,9 +896,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 200, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 200 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -874,6 +907,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -882,6 +916,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Next(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -899,9 +934,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -920,6 +956,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -934,9 +971,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, i + 2, options.max_sequential_skip_in_iterations, - nullptr 
/*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, i + 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -959,13 +997,14 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } } TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { Options options; - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); ReadOptions ro; @@ -984,9 +1023,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 0; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1014,7 +1054,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); - ASSERT_TRUE(db_iter->status().ok()); + ASSERT_OK(db_iter->status()); } // Test to make sure that the request will *not* fail as incomplete if @@ -1031,9 +1071,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1076,9 +1117,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1115,9 +1157,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1151,9 +1194,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); 
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1182,9 +1226,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1220,9 +1265,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = 2; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1258,9 +1304,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = i; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1312,9 +1359,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { options.max_sequential_skip_in_iterations = 1000; ro.max_skippable_internal_keys = i; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1351,9 +1399,10 @@ TEST_F(DBIteratorTest, DBIterator1) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 1 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1379,9 +1428,10 @@ TEST_F(DBIteratorTest, DBIterator2) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, 
ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 0 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1404,9 +1454,10 @@ TEST_F(DBIteratorTest, DBIterator3) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 2 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1429,9 +1480,10 @@ TEST_F(DBIteratorTest, DBIterator4) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 4 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1448,7 +1500,7 @@ TEST_F(DBIteratorTest, DBIterator5) { ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -1463,9 +1515,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1486,9 +1539,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1509,9 +1563,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + 
nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1532,9 +1587,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 3, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 3 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1555,9 +1611,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1578,9 +1635,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 5, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 5 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1601,9 +1659,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 6, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 6 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1622,9 +1681,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddPut("b", "val_b"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -1638,7 +1698,7 @@ TEST_F(DBIteratorTest, DBIterator6) { ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -1653,9 +1713,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr 
db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1676,9 +1737,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 1, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 1 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1699,9 +1761,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1722,9 +1785,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 3, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 3 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -1741,9 +1805,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1764,9 +1829,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 5, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 5 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1787,9 +1853,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - 
internal_iter, 6, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 6 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1803,7 +1870,7 @@ TEST_F(DBIteratorTest, DBIterator7) { ReadOptions ro; Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); - ImmutableCFOptions cf_options = ImmutableCFOptions(options); + ImmutableOptions ioptions = ImmutableOptions(options); MutableCFOptions mutable_cf_options = MutableCFOptions(options); { @@ -1830,9 +1897,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 0 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1865,9 +1933,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 2 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1906,9 +1975,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 4 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1947,9 +2017,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 5, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 5 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1993,9 +2064,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 6, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 6 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2040,9 +2112,10 @@ TEST_F(DBIteratorTest, DBIterator7) { 
internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 7, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 7 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2081,9 +2154,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 9, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 9 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2128,9 +2202,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 13, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 13 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2176,9 +2251,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), - internal_iter, 14, options.max_sequential_skip_in_iterations, - nullptr /*read_callback*/)); + env_, ro, ioptions, mutable_cf_options, BytewiseComparator(), + internal_iter, nullptr /* version */, 14 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2207,9 +2283,10 @@ TEST_F(DBIteratorTest, DBIterator8) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2238,9 +2315,10 @@ TEST_F(DBIteratorTest, DBIterator9) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2305,9 +2383,10 @@ TEST_F(DBIteratorTest, DBIterator10) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 
10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->Seek("c"); ASSERT_TRUE(db_iter->Valid()); @@ -2345,9 +2424,9 @@ TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, 0 /* force seek */, - nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2374,9 +2453,10 @@ TEST_F(DBIteratorTest, DBIterator11) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 1 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2401,8 +2481,9 @@ TEST_F(DBIteratorTest, DBIterator12) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, 0, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -2438,8 +2519,10 @@ TEST_F(DBIteratorTest, DBIterator13) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, 3, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 2 /* sequence */, 3 /* max_sequential_skip_in_iterations */, + nullptr /* read_callback */)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), key); @@ -2466,8 +2549,10 @@ TEST_F(DBIteratorTest, DBIterator14) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 4, 1, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 4 /* sequence */, 1 /* max_sequential_skip_in_iterations */, + nullptr /* read_callback */)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2493,9 +2578,10 @@ TEST_F(DBIteratorTest, DBIteratorTestDifferentialSnapshots) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - 
BytewiseComparator(), internal_iter, 13, - options.max_sequential_skip_in_iterations, nullptr)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 13 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); // Expecting InternalKeys in [5,8] range with correct type int seqnums[4] = {5,8,11,13}; std::string user_keys[4] = {"1","2","3","4"}; @@ -2528,9 +2614,10 @@ TEST_F(DBIteratorTest, DBIteratorTestDifferentialSnapshots) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 13, - options.max_sequential_skip_in_iterations, nullptr)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 13 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); // Expecting InternalKeys in [5,8] range with correct type int seqnums[4] = {5,8,11,13}; EntryType key_types[4] = {EntryType::kEntryDelete,EntryType::kEntryDelete, @@ -2578,10 +2665,10 @@ class DBIterWithMergeIterTest : public testing::Test { NewMergingIterator(&icomp_, &child_iters[0], 2u); db_iter_.reset(NewDBIterator( - env_, ro_, ImmutableCFOptions(options_), MutableCFOptions(options_), - BytewiseComparator(), merge_iter, + env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_), + BytewiseComparator(), merge_iter, nullptr /* version */, 8 /* read data earlier than seqId 8 */, - 3 /* max iterators before reseek */, nullptr /*read_callback*/)); + 3 /* max iterators before reseek */, nullptr /* read_callback */)); } Env* env_; @@ -3018,9 +3105,10 @@ TEST_F(DBIteratorTest, SeekPrefixTombstones) { ro.prefix_same_as_start = true; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); int skipped_keys = 0; @@ -3054,15 +3142,16 @@ TEST_F(DBIteratorTest, SeekToFirstLowerBound) { ro.iterate_lower_bound = &lower_bound; Options options; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10 /* sequence */, - options.max_sequential_skip_in_iterations, + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); db_iter->SeekToFirst(); if (i == kNumKeys + 1) { // lower bound was beyond the last key ASSERT_FALSE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); } else { ASSERT_TRUE(db_iter->Valid()); int expected; @@ -3093,9 +3182,10 @@ TEST_F(DBIteratorTest, PrevLowerBound) { ro.iterate_lower_bound = &lower_bound; Options options; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10 /* sequence */, - options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 
nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekToLast(); for (int i = kNumKeys; i >= kLowerBound; --i) { @@ -3121,9 +3211,10 @@ TEST_F(DBIteratorTest, SeekLessLowerBound) { ro.iterate_lower_bound = &lower_bound; Options options; std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), - BytewiseComparator(), internal_iter, 10 /* sequence */, - options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); + env_, ro, ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); auto before_lower_bound_str = std::to_string(kLowerBound - 1); Slice before_lower_bound(lower_bound_str); @@ -3146,9 +3237,10 @@ TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ReadOptions(), ImmutableCFOptions(options), - MutableCFOptions(options), BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + env_, ReadOptions(), ImmutableOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, nullptr /* version */, + 10 /* sequence */, options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); db_iter->SeekForPrev("a"); ASSERT_TRUE(db_iter->Valid()); diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index 8d2b0a7c0f4..ae972ee967f 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -18,6 +18,7 @@ #include "rocksdb/perf_context.h" #include "table/block_based/flush_block_policy.h" #include "util/random.h" +#include "utilities/merge_operators/string_append/stringappend2.h" namespace ROCKSDB_NAMESPACE { @@ -67,8 +68,8 @@ TEST_P(DBIteratorTest, IteratorProperty) { // The test needs to be changed if kPersistedTier is supported in iterator. 
Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); - Put(1, "1", "2"); - Delete(1, "2"); + ASSERT_OK(Put(1, "1", "2")); + ASSERT_OK(Delete(1, "2")); ReadOptions ropt; ropt.pin_data = false; { @@ -172,10 +173,10 @@ TEST_P(DBIteratorTest, NonBlockingIteration) { TEST_P(DBIteratorTest, IterSeekBeforePrev) { ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("0", "f")); ASSERT_OK(Put("1", "h")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("2", "j")); auto iter = NewIterator(ReadOptions()); iter->Seek(Slice("c")); @@ -199,7 +200,7 @@ TEST_P(DBIteratorTest, IterReseekNewUpperBound) { ASSERT_OK(Put("aabb", rnd.RandomString(400))); ASSERT_OK(Put("aaef", rnd.RandomString(400))); ASSERT_OK(Put("b", rnd.RandomString(400))); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ReadOptions opts; Slice ub = Slice("aa"); opts.iterate_upper_bound = &ub; @@ -215,10 +216,10 @@ TEST_P(DBIteratorTest, IterReseekNewUpperBound) { TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) { ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("0", "f")); ASSERT_OK(Put("1", "h")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("2", "j")); auto iter = NewIterator(ReadOptions()); iter->SeekForPrev(Slice("0")); @@ -238,7 +239,7 @@ TEST_P(DBIteratorTest, IterLongKeys) { ASSERT_OK(Put(MakeLongKey(20, 0), "0")); ASSERT_OK(Put(MakeLongKey(32, 2), "2")); ASSERT_OK(Put("a", "b")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put(MakeLongKey(50, 1), "1")); ASSERT_OK(Put(MakeLongKey(127, 3), "3")); ASSERT_OK(Put(MakeLongKey(64, 4), "4")); @@ -276,7 +277,7 @@ TEST_P(DBIteratorTest, IterLongKeys) { TEST_P(DBIteratorTest, IterNextWithNewerSeq) { ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("d", "e")); @@ -302,7 +303,7 @@ TEST_P(DBIteratorTest, IterNextWithNewerSeq) { TEST_P(DBIteratorTest, IterPrevWithNewerSeq) { ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("d", "e")); @@ -333,7 +334,7 @@ TEST_P(DBIteratorTest, IterPrevWithNewerSeq) { TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) { ASSERT_OK(Put("0", "0")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("e", "f")); @@ -377,6 +378,8 @@ TEST_P(DBIteratorTest, IterEmpty) { iter->SeekForPrev("foo"); ASSERT_EQ(IterStatus(iter), "(invalid)"); + ASSERT_OK(iter->status()); + delete iter; } while (ChangeCompactOptions()); } @@ -617,6 +620,40 @@ TEST_P(DBIteratorTest, IterReseek) { delete iter; } +TEST_F(DBIteratorTest, ReseekUponDirectionChange) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.merge_operator.reset( + new StringAppendTESTOperator(/*delim_char=*/' ')); + DestroyAndReopen(options); + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(Put("bar", "value")); + { + std::unique_ptr 
it(db_->NewIterator(ReadOptions())); + it->SeekToLast(); + it->Prev(); + it->Next(); + } + ASSERT_EQ(1, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + const std::string merge_key("good"); + ASSERT_OK(Put(merge_key, "orig")); + ASSERT_OK(Merge(merge_key, "suffix")); + { + std::unique_ptr it(db_->NewIterator(ReadOptions())); + it->Seek(merge_key); + ASSERT_TRUE(it->Valid()); + const uint64_t prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + it->Prev(); + ASSERT_EQ(prev_reseek_count + 1, options.statistics->getTickerCount( + NUMBER_OF_RESEEKS_IN_ITERATION)); + } +} + TEST_P(DBIteratorTest, IterSmallAndLargeMix) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); @@ -783,18 +820,18 @@ TEST_P(DBIteratorTest, IterWithSnapshot) { TEST_P(DBIteratorTest, IteratorPinsRef) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "hello"); + ASSERT_OK(Put(1, "foo", "hello")); // Get iterator that will yield the current contents of the DB. Iterator* iter = NewIterator(ReadOptions(), handles_[1]); // Write to force compactions - Put(1, "foo", "newvalue1"); + ASSERT_OK(Put(1, "foo", "newvalue1")); for (int i = 0; i < 100; i++) { // 100K values ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v'))); } - Put(1, "foo", "newvalue2"); + ASSERT_OK(Put(1, "foo", "newvalue2")); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); @@ -809,8 +846,8 @@ TEST_P(DBIteratorTest, IteratorPinsRef) { TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "delete-cf-then-delete-iter"); - Put(1, "hello", "value2"); + ASSERT_OK(Put(1, "foo", "delete-cf-then-delete-iter")); + ASSERT_OK(Put(1, "hello", "value2")); ColumnFamilyHandle* cf = handles_[1]; ReadOptions ro; @@ -820,7 +857,7 @@ TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) { ASSERT_EQ(IterStatus(iter), "foo->delete-cf-then-delete-iter"); // delete CF handle - db_->DestroyColumnFamilyHandle(cf); + EXPECT_OK(db_->DestroyColumnFamilyHandle(cf)); handles_.erase(std::begin(handles_) + 1); // delete Iterator after CF handle is deleted @@ -832,7 +869,7 @@ TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) { TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Put(1, "foo", "drop-cf-then-delete-iter"); + ASSERT_OK(Put(1, "foo", "drop-cf-then-delete-iter")); ReadOptions ro; ColumnFamilyHandle* cf = handles_[1]; @@ -842,8 +879,8 @@ TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) { ASSERT_EQ(IterStatus(iter), "foo->drop-cf-then-delete-iter"); // drop and delete CF - db_->DropColumnFamily(cf); - db_->DestroyColumnFamilyHandle(cf); + EXPECT_OK(db_->DropColumnFamily(cf)); + EXPECT_OK(db_->DestroyColumnFamilyHandle(cf)); handles_.erase(std::begin(handles_) + 1); // delete Iterator after CF handle is dropped @@ -1307,9 +1344,9 @@ TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) { // write three entries with different keys using Merge() WriteOptions wopts; - db_->Merge(wopts, "1", "data1"); - db_->Merge(wopts, "2", "data2"); - db_->Merge(wopts, "3", "data3"); + ASSERT_OK(db_->Merge(wopts, "1", "data1")); + ASSERT_OK(db_->Merge(wopts, "2", "data2")); + ASSERT_OK(db_->Merge(wopts, "3", "data3")); std::unique_ptr it(NewIterator(ReadOptions())); @@ -1393,7 +1430,7 @@ class DBIteratorTestForPinnedData : public DBIteratorTest { if (run_config == TestConfig::FLUSH_EVERY_1000) { if (i && i % 1000 == 0) { - Flush(); + ASSERT_OK(Flush()); } } } @@ -1402,7 +1439,7 
@@ class DBIteratorTestForPinnedData : public DBIteratorTest { Close(); Reopen(options); } else if (run_config == TestConfig::COMPACT_BEFORE_READ) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } ReadOptions ro; @@ -1516,6 +1553,10 @@ TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedFlush) { PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000); } +INSTANTIATE_TEST_CASE_P(DBIteratorTestForPinnedDataInstance, + DBIteratorTestForPinnedData, + testing::Values(true, false)); + #ifndef ROCKSDB_LITE TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) { Options options = CurrentOptions(); @@ -1768,6 +1809,7 @@ TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) { Iterator* iter = NewIterator(ro); iter->SeekForPrev("c2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1823,6 +1865,7 @@ TEST_P(DBIteratorTest, IterSeekForPrevCrossingFilesCustomPrefixExtractor) { Iterator* iter = NewIterator(ro); iter->SeekForPrev("c2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -2151,19 +2194,19 @@ TEST_P(DBIteratorTest, ReadAhead) { std::string value(1024, 'a'); for (int i = 0; i < 100; i++) { - Put(Key(i), value); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); MoveFilesToLevel(2); for (int i = 0; i < 100; i++) { - Put(Key(i), value); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); MoveFilesToLevel(1); for (int i = 0; i < 100; i++) { - Put(Key(i), value); + ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); #ifndef ROCKSDB_LITE @@ -2270,6 +2313,7 @@ TEST_P(DBIteratorTest, Refresh) { ASSERT_OK(Put("x", "y")); std::unique_ptr iter(NewIterator(ReadOptions())); + ASSERT_OK(iter->status()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(iter->key().compare(Slice("x")), 0); @@ -2284,7 +2328,8 @@ TEST_P(DBIteratorTest, Refresh) { iter->Next(); ASSERT_FALSE(iter->Valid()); - iter->Refresh(); + ASSERT_OK(iter->status()); + ASSERT_OK(iter->Refresh()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); @@ -2295,7 +2340,7 @@ TEST_P(DBIteratorTest, Refresh) { iter->Next(); ASSERT_FALSE(iter->Valid()); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("m", "n")); @@ -2308,7 +2353,8 @@ TEST_P(DBIteratorTest, Refresh) { iter->Next(); ASSERT_FALSE(iter->Valid()); - iter->Refresh(); + ASSERT_OK(iter->status()); + ASSERT_OK(iter->Refresh()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); @@ -2331,6 +2377,7 @@ TEST_P(DBIteratorTest, RefreshWithSnapshot) { ReadOptions options; options.snapshot = snapshot; Iterator* iter = NewIterator(options); + ASSERT_OK(iter->status()); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); @@ -2346,8 +2393,8 @@ TEST_P(DBIteratorTest, RefreshWithSnapshot) { iter->Next(); ASSERT_FALSE(iter->Valid()); - Status s; - s = iter->Refresh(); + ASSERT_OK(iter->status()); + Status s = iter->Refresh(); ASSERT_TRUE(s.IsNotSupported()); db_->ReleaseSnapshot(snapshot); delete iter; @@ -2405,14 +2452,14 @@ TEST_P(DBIteratorTest, UpperBoundWithChangeDirection) { TEST_P(DBIteratorTest, TableFilter) { ASSERT_OK(Put("a", "1")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("b", "2")); ASSERT_OK(Put("c", "3")); - dbfull()->Flush(FlushOptions()); + EXPECT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("d", "4")); ASSERT_OK(Put("e", "5")); ASSERT_OK(Put("f", "6")); - dbfull()->Flush(FlushOptions()); + 
EXPECT_OK(dbfull()->Flush(FlushOptions())); // Ensure the table_filter callback is called once for each table. { @@ -2597,13 +2644,13 @@ TEST_P(DBIteratorTest, SeekAfterHittingManyInternalKeys) { ReadOptions ropts; ropts.max_skippable_internal_keys = 2; - Put("1", "val_1"); + ASSERT_OK(Put("1", "val_1")); // Add more tombstones than max_skippable_internal_keys so that Next() fails. - Delete("2"); - Delete("3"); - Delete("4"); - Delete("5"); - Put("6", "val_6"); + ASSERT_OK(Delete("2")); + ASSERT_OK(Delete("3")); + ASSERT_OK(Delete("4")); + ASSERT_OK(Delete("5")); + ASSERT_OK(Put("6", "val_6")); std::unique_ptr iter(NewIterator(ropts)); iter->SeekToFirst(); @@ -2645,9 +2692,9 @@ TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) { DestroyAndReopen(options); // Two records in sst file, each in its own block. - Put("b", ""); - Put("d", ""); - Flush(); + ASSERT_OK(Put("b", "")); + ASSERT_OK(Put("d", "")); + ASSERT_OK(Flush()); // Create a nonblocking iterator before writing to memtable. ReadOptions ropt; @@ -2657,7 +2704,7 @@ TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) { // Overwrite a key in memtable many times to hit // max_sequential_skip_in_iterations (which is 8 by default). for (int i = 0; i < 20; ++i) { - Put("c", ""); + ASSERT_OK(Put("c", "")); } // Load the second block in sst file into the block cache. @@ -2674,9 +2721,9 @@ TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) { } TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) { - Put("a", ""); - Put("b", ""); - Flush(); + ASSERT_OK(Put("a", "")); + ASSERT_OK(Put("b", "")); + ASSERT_OK(Flush()); ReadOptions ropt; Slice ub = "b"; @@ -2883,6 +2930,127 @@ TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) { ASSERT_OK(iter->status()); } +TEST_P(DBIteratorTest, Blob) { + Options options = CurrentOptions(); + options.enable_blob_files = true; + options.max_sequential_skip_in_iterations = 2; + options.statistics = CreateDBStatistics(); + + Reopen(options); + + // Note: we have 4 KVs (3 of which are hidden) for key "b" and + // max_sequential_skip_in_iterations is set to 2. Thus, we need to do a reseek + // anytime we move from "b" to "c" or vice versa. 
+ ASSERT_OK(Put("a", "va")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb0")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb2")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("b", "vb3")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("c", "vc")); + ASSERT_OK(Flush()); + + std::unique_ptr iter_guard(NewIterator(ReadOptions())); + Iterator* const iter = iter_guard.get(); + + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->SeekToLast(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekToLast(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + iter->Seek(""); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("a"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "a->va"); + iter->Seek("ax"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + iter->SeekForPrev("d"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("c"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2); + ASSERT_EQ(IterStatus(iter), "c->vc"); + iter->SeekForPrev("bx"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + iter->Seek("b"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->Seek("z"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + iter->SeekForPrev("b"); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + iter->SeekForPrev(""); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + ASSERT_EQ(IterStatus(iter), "(invalid)"); + + // Switch from reverse to forward + iter->SeekToLast(); + 
ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6); + ASSERT_EQ(IterStatus(iter), "b->vb3"); + + // Switch from forward to reverse + iter->SeekToFirst(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6); + iter->Next(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 7); + iter->Prev(); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 8); + ASSERT_EQ(IterStatus(iter), "b->vb3"); +} + INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, testing::Values(true, false)); @@ -3021,6 +3189,44 @@ TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) { delete iter; } +TEST_F(DBIteratorTest, BackwardIterationOnInplaceUpdateMemtable) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = false; + options.env = env_; + DestroyAndReopen(options); + constexpr int kNumKeys = 10; + + // Write kNumKeys to WAL. + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(Put(Key(i), "val")); + } + ReadOptions read_opts; + read_opts.total_order_seek = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + int count = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ++count; + } + ASSERT_EQ(kNumKeys, count); + } + + // Reopen and rebuild the memtable from WAL. + options.create_if_missing = false; + options.avoid_flush_during_recovery = true; + options.inplace_update_support = true; + options.allow_concurrent_memtable_write = false; + Reopen(options); + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToLast(); + // Backward iteration not supported due to inplace_update_support = true. + ASSERT_TRUE(iter->status().IsNotSupported()); + ASSERT_FALSE(iter->Valid()); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_kv_checksum_test.cc b/db/db_kv_checksum_test.cc new file mode 100644 index 00000000000..24411811928 --- /dev/null +++ b/db/db_kv_checksum_test.cc @@ -0,0 +1,199 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +enum class WriteBatchOpType { + kPut = 0, + kDelete, + kSingleDelete, + kDeleteRange, + kMerge, + kBlobIndex, + kNum, +}; + +// Integer addition is needed for `::testing::Range()` to take the enum type. 
+WriteBatchOpType operator+(WriteBatchOpType lhs, const int rhs) { + using T = std::underlying_type::type; + return static_cast(static_cast(lhs) + rhs); +} + +class DbKvChecksumTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + DbKvChecksumTest() + : DBTestBase("/db_kv_checksum_test", /*env_do_fsync=*/false) { + op_type_ = std::get<0>(GetParam()); + corrupt_byte_addend_ = std::get<1>(GetParam()); + } + + std::pair GetWriteBatch(size_t ts_sz, + ColumnFamilyHandle* cf_handle) { + Status s; + WriteBatch wb(0 /* reserved_bytes */, 0 /* max_bytes */, ts_sz, + 8 /* protection_bytes_per_entry */); + switch (op_type_) { + case WriteBatchOpType::kPut: + s = wb.Put(cf_handle, "key", "val"); + break; + case WriteBatchOpType::kDelete: + s = wb.Delete(cf_handle, "key"); + break; + case WriteBatchOpType::kSingleDelete: + s = wb.SingleDelete(cf_handle, "key"); + break; + case WriteBatchOpType::kDeleteRange: + s = wb.DeleteRange(cf_handle, "begin", "end"); + break; + case WriteBatchOpType::kMerge: + s = wb.Merge(cf_handle, "key", "val"); + break; + case WriteBatchOpType::kBlobIndex: + // TODO(ajkr): use public API once available. + uint32_t cf_id; + if (cf_handle == nullptr) { + cf_id = 0; + } else { + cf_id = cf_handle->GetID(); + } + s = WriteBatchInternal::PutBlobIndex(&wb, cf_id, "key", "val"); + break; + case WriteBatchOpType::kNum: + assert(false); + } + return {std::move(wb), std::move(s)}; + } + + void CorruptNextByteCallBack(void* arg) { + Slice encoded = *static_cast(arg); + if (entry_len_ == port::kMaxSizet) { + // We learn the entry size on the first attempt + entry_len_ = encoded.size(); + } + // All entries should be the same size + assert(entry_len_ == encoded.size()); + char* buf = const_cast(encoded.data()); + buf[corrupt_byte_offset_] += corrupt_byte_addend_; + ++corrupt_byte_offset_; + } + + bool MoreBytesToCorrupt() { return corrupt_byte_offset_ < entry_len_; } + + protected: + WriteBatchOpType op_type_; + char corrupt_byte_addend_; + size_t corrupt_byte_offset_ = 0; + size_t entry_len_ = port::kMaxSizet; +}; + +std::string GetTestNameSuffix( + ::testing::TestParamInfo> info) { + std::ostringstream oss; + switch (std::get<0>(info.param)) { + case WriteBatchOpType::kPut: + oss << "Put"; + break; + case WriteBatchOpType::kDelete: + oss << "Delete"; + break; + case WriteBatchOpType::kSingleDelete: + oss << "SingleDelete"; + break; + case WriteBatchOpType::kDeleteRange: + oss << "DeleteRange"; + break; + case WriteBatchOpType::kMerge: + oss << "Merge"; + break; + case WriteBatchOpType::kBlobIndex: + oss << "BlobIndex"; + break; + case WriteBatchOpType::kNum: + assert(false); + } + oss << "Add" + << static_cast(static_cast(std::get<1>(info.param))); + return oss.str(); +} + +INSTANTIATE_TEST_CASE_P( + DbKvChecksumTest, DbKvChecksumTest, + ::testing::Combine(::testing::Range(static_cast(0), + WriteBatchOpType::kNum), + ::testing::Values(2, 103, 251)), + GetTestNameSuffix); + +TEST_P(DbKvChecksumTest, MemTableAddCorrupted) { + // This test repeatedly attempts to write `WriteBatch`es containing a single + // entry of type `op_type_`. Each attempt has one byte corrupted in its + // memtable entry by adding `corrupt_byte_addend_` to its original value. The + // test repeats until an attempt has been made on each byte in the encoded + // memtable entry. 
All attempts are expected to fail with `Status::Corruption` + SyncPoint::GetInstance()->SetCallBack( + "MemTable::Add:Encoded", + std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this, + std::placeholders::_1)); + + while (MoreBytesToCorrupt()) { + // Failed memtable insert always leads to read-only mode, so we have to + // reopen for every attempt. + Options options = CurrentOptions(); + if (op_type_ == WriteBatchOpType::kMerge) { + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + } + Reopen(options); + + SyncPoint::GetInstance()->EnableProcessing(); + auto batch_and_status = + GetWriteBatch(0 /* ts_sz */, nullptr /* cf_handle */); + ASSERT_OK(batch_and_status.second); + ASSERT_TRUE( + db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_P(DbKvChecksumTest, MemTableAddWithColumnFamilyCorrupted) { + // This test repeatedly attempts to write `WriteBatch`es containing a single + // entry of type `op_type_` to a non-default column family. Each attempt has + // one byte corrupted in its memtable entry by adding `corrupt_byte_addend_` + // to its original value. The test repeats until an attempt has been made on + // each byte in the encoded memtable entry. All attempts are expected to fail + // with `Status::Corruption`. + Options options = CurrentOptions(); + if (op_type_ == WriteBatchOpType::kMerge) { + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + } + CreateAndReopenWithCF({"pikachu"}, options); + SyncPoint::GetInstance()->SetCallBack( + "MemTable::Add:Encoded", + std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this, + std::placeholders::_1)); + + while (MoreBytesToCorrupt()) { + // Failed memtable insert always leads to read-only mode, so we have to + // reopen for every attempt. 
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + + SyncPoint::GetInstance()->EnableProcessing(); + auto batch_and_status = GetWriteBatch(0 /* ts_sz */, handles_[1]); + ASSERT_OK(batch_and_status.second); + ASSERT_TRUE( + db_->Write(WriteOptions(), &batch_and_status.first).IsCorruption()); + SyncPoint::GetInstance()->DisableProcessing(); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_log_iter_test.cc b/db/db_log_iter_test.cc index 51d232a6a4f..9baf720375c 100644 --- a/db/db_log_iter_test.cc +++ b/db/db_log_iter_test.cc @@ -13,6 +13,7 @@ #if !defined(ROCKSDB_LITE) #include "db/db_test_util.h" +#include "env/mock_env.h" #include "port/stack_trace.h" namespace ROCKSDB_NAMESPACE { @@ -33,9 +34,8 @@ class DBTestXactLogIterator : public DBTestBase { }; namespace { -SequenceNumber ReadRecords( - std::unique_ptr& iter, - int& count) { +SequenceNumber ReadRecords(std::unique_ptr& iter, + int& count, bool expect_ok = true) { count = 0; SequenceNumber lastSequence = 0; BatchResult res; @@ -47,6 +47,11 @@ SequenceNumber ReadRecords( EXPECT_OK(iter->status()); iter->Next(); } + if (expect_ok) { + EXPECT_OK(iter->status()); + } else { + EXPECT_NOK(iter->status()); + } return res.sequence; } @@ -64,9 +69,9 @@ TEST_F(DBTestXactLogIterator, TransactionLogIterator) { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - Put(0, "key1", DummyString(1024)); - Put(1, "key2", DummyString(1024)); - Put(1, "key2", DummyString(1024)); + ASSERT_OK(Put(0, "key1", DummyString(1024))); + ASSERT_OK(Put(1, "key2", DummyString(1024))); + ASSERT_OK(Put(1, "key2", DummyString(1024))); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U); { auto iter = OpenTransactionLogIter(0); @@ -75,9 +80,9 @@ TEST_F(DBTestXactLogIterator, TransactionLogIterator) { ReopenWithColumnFamilies({"default", "pikachu"}, options); env_->SleepForMicroseconds(2 * 1000 * 1000); { - Put(0, "key4", DummyString(1024)); - Put(1, "key5", DummyString(1024)); - Put(0, "key6", DummyString(1024)); + ASSERT_OK(Put(0, "key4", DummyString(1024))); + ASSERT_OK(Put(1, "key5", DummyString(1024))); + ASSERT_OK(Put(0, "key6", DummyString(1024))); } { auto iter = OpenTransactionLogIter(0); @@ -109,15 +114,15 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - Put("key1", DummyString(1024)); - dbfull()->Flush(FlushOptions()); - Put("key2", DummyString(1024)); - dbfull()->Flush(FlushOptions()); - Put("key3", DummyString(1024)); - dbfull()->Flush(FlushOptions()); - Put("key4", DummyString(1024)); + ASSERT_OK(Put("key1", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("key2", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("key3", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(Put("key4", DummyString(1024))); ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->FlushWAL(false)); { auto iter = OpenTransactionLogIter(0); @@ -130,11 +135,11 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) { // condition FlushOptions flush_options; flush_options.wait = false; - dbfull()->Flush(flush_options); + ASSERT_OK(dbfull()->Flush(flush_options)); // "key5" would be 
written in a new memtable and log - Put("key5", DummyString(1024)); - dbfull()->FlushWAL(false); + ASSERT_OK(Put("key5", DummyString(1024))); + ASSERT_OK(dbfull()->FlushWAL(false)); { // this iter would miss "key4" if not fixed auto iter = OpenTransactionLogIter(0); @@ -149,14 +154,14 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorStallAtLastRecord) { do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - Put("key1", DummyString(1024)); + ASSERT_OK(Put("key1", DummyString(1024))); auto iter = OpenTransactionLogIter(0); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); iter->Next(); ASSERT_TRUE(!iter->Valid()); ASSERT_OK(iter->status()); - Put("key2", DummyString(1024)); + ASSERT_OK(Put("key2", DummyString(1024))); iter->Next(); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); @@ -167,9 +172,9 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckAfterRestart) { do { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); - Put("key1", DummyString(1024)); - Put("key2", DummyString(1023)); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("key1", DummyString(1024))); + ASSERT_OK(Put("key2", DummyString(1023))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); Reopen(options); auto iter = OpenTransactionLogIter(0); ExpectRecords(2, iter); @@ -181,10 +186,10 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); for (int i = 0; i < 1024; i++) { - Put("key"+ToString(i), DummyString(10)); + ASSERT_OK(Put("key" + ToString(i), DummyString(10))); } - dbfull()->Flush(FlushOptions()); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(dbfull()->FlushWAL(false)); // Corrupt this log to create a gap ROCKSDB_NAMESPACE::VectorLogPtr wal_files; ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); @@ -197,13 +202,13 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) { } // Insert a new entry to a new log file - Put("key1025", DummyString(10)); - dbfull()->FlushWAL(false); + ASSERT_OK(Put("key1025", DummyString(10))); + ASSERT_OK(dbfull()->FlushWAL(false)); // Try to read from the beginning. 
Should stop before the gap and read less // than 1025 entries auto iter = OpenTransactionLogIter(0); int count; - SequenceNumber last_sequence_read = ReadRecords(iter, count); + SequenceNumber last_sequence_read = ReadRecords(iter, count, false); ASSERT_LT(last_sequence_read, 1025U); // Try to read past the gap, should be able to seek to key1025 auto iter2 = OpenTransactionLogIter(last_sequence_read + 1); @@ -217,15 +222,15 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorBatchOperations) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); WriteBatch batch; - batch.Put(handles_[1], "key1", DummyString(1024)); - batch.Put(handles_[0], "key2", DummyString(1024)); - batch.Put(handles_[1], "key3", DummyString(1024)); - batch.Delete(handles_[0], "key2"); - dbfull()->Write(WriteOptions(), &batch); - Flush(1); - Flush(0); + ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024))); + ASSERT_OK(batch.Delete(handles_[0], "key2")); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(0)); ReopenWithColumnFamilies({"default", "pikachu"}, options); - Put(1, "key4", DummyString(1024)); + ASSERT_OK(Put(1, "key4", DummyString(1024))); auto iter = OpenTransactionLogIter(3); ExpectRecords(2, iter); } while (ChangeCompactOptions()); @@ -237,13 +242,13 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) { CreateAndReopenWithCF({"pikachu"}, options); { WriteBatch batch; - batch.Put(handles_[1], "key1", DummyString(1024)); - batch.Put(handles_[0], "key2", DummyString(1024)); - batch.PutLogData(Slice("blob1")); - batch.Put(handles_[1], "key3", DummyString(1024)); - batch.PutLogData(Slice("blob2")); - batch.Delete(handles_[0], "key2"); - dbfull()->Write(WriteOptions(), &batch); + ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024))); + ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Delete(handles_[0], "key2")); + ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); ReopenWithColumnFamilies({"default", "pikachu"}, options); } @@ -268,7 +273,7 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) { return Status::OK(); } } handler; - res.writeBatchPtr->Iterate(&handler); + ASSERT_OK(res.writeBatchPtr->Iterate(&handler)); ASSERT_EQ( "Put(1, key1, 1024)" "Put(0, key2, 1024)" diff --git a/db/db_logical_block_size_cache_test.cc b/db/db_logical_block_size_cache_test.cc index 20f6abadca1..1057871c9f3 100644 --- a/db/db_logical_block_size_cache_test.cc +++ b/db/db_logical_block_size_cache_test.cc @@ -401,7 +401,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { ColumnFamilyOptions cf_options0; cf_options0.cf_paths = {{cf_path_0_, 1024}}; ColumnFamilyHandle* cf0; - db0->CreateColumnFamily(cf_options0, "cf", &cf0); + ASSERT_OK(db0->CreateColumnFamily(cf_options0, "cf", &cf0)); ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); @@ -421,7 +421,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { ColumnFamilyOptions cf_options1; cf_options1.cf_paths = {{cf_path_1_, 1024}}; ColumnFamilyHandle* cf1; - db1->CreateColumnFamily(cf_options1, "cf", &cf1); + ASSERT_OK(db1->CreateColumnFamily(cf_options1, "cf", &cf1)); 
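Each database and column family in this test gets its own directory, which is what drives the per-path entries in the logical block size cache. A minimal sketch of giving a column family its own path via ColumnFamilyOptions::cf_paths (path and target size are illustrative; db is assumed to be an open rocksdb::DB*):

#include <rocksdb/db.h>

rocksdb::ColumnFamilyOptions cf_options;
cf_options.cf_paths = {{"/data/cf_dir", 1024 /* target_size in bytes */}};

rocksdb::ColumnFamilyHandle* cf = nullptr;
rocksdb::Status s = db->CreateColumnFamily(cf_options, "cf", &cf);
if (s.ok()) {
  // ... use the column family; its SST files land under /data/cf_dir ...
  s = db->DestroyColumnFamilyHandle(cf);
}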
ASSERT_EQ(4, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); @@ -432,7 +432,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { ASSERT_TRUE(cache_->Contains(cf_path_1_)); ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); - db0->DestroyColumnFamilyHandle(cf0); + ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0)); delete db0; ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_1_)); @@ -441,7 +441,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) { ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_)); ASSERT_OK(DestroyDB(data_path_0_, options, {{"cf", cf_options0}})); - db1->DestroyColumnFamilyHandle(cf1); + ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1)); delete db1; ASSERT_EQ(0, cache_->Size()); ASSERT_OK(DestroyDB(data_path_1_, options, {{"cf", cf_options1}})); @@ -466,7 +466,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) { ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); ColumnFamilyHandle* cf0; - db0->CreateColumnFamily(cf_options, "cf", &cf0); + ASSERT_OK(db0->CreateColumnFamily(cf_options, "cf", &cf0)); ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); ASSERT_EQ(1, cache_->GetRefCount(data_path_0_)); @@ -482,14 +482,14 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) { ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); ColumnFamilyHandle* cf1; - db1->CreateColumnFamily(cf_options, "cf", &cf1); + ASSERT_OK(db1->CreateColumnFamily(cf_options, "cf", &cf1)); ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); ASSERT_EQ(2, cache_->GetRefCount(data_path_0_)); ASSERT_TRUE(cache_->Contains(cf_path_0_)); ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_)); - db0->DestroyColumnFamilyHandle(cf0); + ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0)); delete db0; ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); @@ -498,7 +498,7 @@ TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) { ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_)); ASSERT_OK(DestroyDB(dbname_ + "/db0", options, {{"cf", cf_options}})); - db1->DestroyColumnFamilyHandle(cf1); + ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1)); delete db1; ASSERT_EQ(0, cache_->Size()); ASSERT_OK(DestroyDB(dbname_ + "/db1", options, {{"cf", cf_options}})); diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index 99763e3516e..cc4aaeb81b6 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ -129,7 +129,6 @@ class TestPrefixExtractor : public SliceTransform { TEST_F(DBMemTableTest, DuplicateSeq) { SequenceNumber seq = 123; std::string value; - Status s; MergeContext merge_context; Options options; InternalKeyComparator ikey_cmp(options.comparator); @@ -140,28 +139,31 @@ TEST_F(DBMemTableTest, DuplicateSeq) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); // Write some keys and make sure it returns false on duplicates - bool res; - res = mem->Add(seq, kTypeValue, "key", "value2"); - ASSERT_TRUE(res); - res = mem->Add(seq, kTypeValue, "key", "value2"); - ASSERT_FALSE(res); + ASSERT_OK( + mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */)); + ASSERT_TRUE( + mem->Add(seq, kTypeValue, "key", "value2", 
nullptr /* kv_prot_info */) + .IsTryAgain()); // Changing the type should still cause the duplicatae key - res = mem->Add(seq, kTypeMerge, "key", "value2"); - ASSERT_FALSE(res); + ASSERT_TRUE( + mem->Add(seq, kTypeMerge, "key", "value2", nullptr /* kv_prot_info */) + .IsTryAgain()); // Changing the seq number will make the key fresh - res = mem->Add(seq + 1, kTypeMerge, "key", "value2"); - ASSERT_TRUE(res); + ASSERT_OK(mem->Add(seq + 1, kTypeMerge, "key", "value2", + nullptr /* kv_prot_info */)); // Test with different types for duplicate keys - res = mem->Add(seq, kTypeDeletion, "key", ""); - ASSERT_FALSE(res); - res = mem->Add(seq, kTypeSingleDeletion, "key", ""); - ASSERT_FALSE(res); + ASSERT_TRUE( + mem->Add(seq, kTypeDeletion, "key", "", nullptr /* kv_prot_info */) + .IsTryAgain()); + ASSERT_TRUE( + mem->Add(seq, kTypeSingleDeletion, "key", "", nullptr /* kv_prot_info */) + .IsTryAgain()); // Test the duplicate keys under stress for (int i = 0; i < 10000; i++) { @@ -169,11 +171,12 @@ TEST_F(DBMemTableTest, DuplicateSeq) { if (!insert_dup) { seq++; } - res = mem->Add(seq, kTypeValue, "foo", "value" + ToString(seq)); + Status s = mem->Add(seq, kTypeValue, "foo", "value" + ToString(seq), + nullptr /* kv_prot_info */); if (insert_dup) { - ASSERT_FALSE(res); + ASSERT_TRUE(s.IsTryAgain()); } else { - ASSERT_TRUE(res); + ASSERT_OK(s); } } delete mem; @@ -181,26 +184,28 @@ TEST_F(DBMemTableTest, DuplicateSeq) { // Test with InsertWithHint options.memtable_insert_with_hint_prefix_extractor.reset( new TestPrefixExtractor()); // which uses _ to extract the prefix - ioptions = ImmutableCFOptions(options); + ioptions = ImmutableOptions(options); mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); // Insert a duplicate key with _ in it - res = mem->Add(seq, kTypeValue, "key_1", "value"); - ASSERT_TRUE(res); - res = mem->Add(seq, kTypeValue, "key_1", "value"); - ASSERT_FALSE(res); + ASSERT_OK( + mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */)); + ASSERT_TRUE( + mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */) + .IsTryAgain()); delete mem; // Test when InsertConcurrently will be invoked options.allow_concurrent_memtable_write = true; - ioptions = ImmutableCFOptions(options); + ioptions = ImmutableOptions(options); mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); MemTablePostProcessInfo post_process_info; - res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info); - ASSERT_TRUE(res); - res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info); - ASSERT_FALSE(res); + ASSERT_OK(mem->Add(seq, kTypeValue, "key", "value", + nullptr /* kv_prot_info */, true, &post_process_info)); + ASSERT_TRUE(mem->Add(seq, kTypeValue, "key", "value", + nullptr /* kv_prot_info */, true, &post_process_info) + .IsTryAgain()); delete mem; } @@ -208,7 +213,6 @@ TEST_F(DBMemTableTest, DuplicateSeq) { TEST_F(DBMemTableTest, ConcurrentMergeWrite) { int num_ops = 1000; std::string value; - Status s; MergeContext merge_context; Options options; // A merge operator that is not sensitive to concurrent writes since in this @@ -220,15 +224,14 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) { auto factory = std::make_shared(); options.memtable_factory = factory; options.allow_concurrent_memtable_write = true; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager 
wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); // Put 0 as the base PutFixed64(&value, static_cast(0)); - bool res = mem->Add(0, kTypeValue, "key", value); - ASSERT_TRUE(res); + ASSERT_OK(mem->Add(0, kTypeValue, "key", value, nullptr /* kv_prot_info */)); value.clear(); // Write Merge concurrently @@ -237,9 +240,8 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) { std::string v1; for (int seq = 1; seq < num_ops / 2; seq++) { PutFixed64(&v1, seq); - bool res1 = - mem->Add(seq, kTypeMerge, "key", v1, true, &post_process_info1); - ASSERT_TRUE(res1); + ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v1, nullptr /* kv_prot_info */, + true, &post_process_info1)); v1.clear(); } }); @@ -248,9 +250,8 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) { std::string v2; for (int seq = num_ops / 2; seq < num_ops; seq++) { PutFixed64(&v2, seq); - bool res2 = - mem->Add(seq, kTypeMerge, "key", v2, true, &post_process_info2); - ASSERT_TRUE(res2); + ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v2, nullptr /* kv_prot_info */, + true, &post_process_info2)); v2.clear(); } }); @@ -261,8 +262,9 @@ TEST_F(DBMemTableTest, ConcurrentMergeWrite) { ReadOptions roptions; SequenceNumber max_covering_tombstone_seq = 0; LookupKey lkey("key", kMaxSequenceNumber); - res = mem->Get(lkey, &value, /*timestamp=*/nullptr, &status, &merge_context, - &max_covering_tombstone_seq, roptions); + bool res = mem->Get(lkey, &value, /*timestamp=*/nullptr, &status, + &merge_context, &max_covering_tombstone_seq, roptions); + ASSERT_OK(status); ASSERT_TRUE(res); uint64_t ivalue = DecodeFixed64(Slice(value).data()); uint64_t sum = 0; @@ -316,6 +318,7 @@ TEST_F(DBMemTableTest, InsertWithHint) { TEST_F(DBMemTableTest, ColumnFamilyId) { // Verifies MemTableRepFactory is told the right column family id. 
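The hunks above reflect MemTable::Add switching from a bool return to a Status: a duplicate of an already-inserted (key, sequence number) pair now surfaces as Status::TryAgain instead of false. A sketch of the calling convention these tests rely on (internal API; mem, seq, and kTypeValue as defined in the surrounding test):

Status s = mem->Add(seq, kTypeValue, "key", "value",
                    nullptr /* kv_prot_info */);
if (s.ok()) {
  // first insertion of this (key, seq) pair succeeded
} else if (s.IsTryAgain()) {
  // duplicate (key, seq); the caller is expected to bump the sequence number
}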
Options options; + options.env = CurrentOptions().env; options.allow_concurrent_memtable_write = false; options.create_if_missing = true; options.memtable_factory.reset(new MockMemTableRepFactory()); diff --git a/db/db_merge_operand_test.cc b/db/db_merge_operand_test.cc index 7ec256213a4..45bafb44c75 100644 --- a/db/db_merge_operand_test.cc +++ b/db/db_merge_operand_test.cc @@ -19,33 +19,35 @@ namespace ROCKSDB_NAMESPACE { -class DBMergeOperandTest : public DBTestBase { +namespace { +class LimitedStringAppendMergeOp : public StringAppendTESTOperator { public: - DBMergeOperandTest() - : DBTestBase("/db_merge_operand_test", /*env_do_fsync=*/true) {} -}; + LimitedStringAppendMergeOp(int limit, char delim) + : StringAppendTESTOperator(delim), limit_(limit) {} -TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { - class LimitedStringAppendMergeOp : public StringAppendTESTOperator { - public: - LimitedStringAppendMergeOp(int limit, char delim) - : StringAppendTESTOperator(delim), limit_(limit) {} + const char* Name() const override { + return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + } - const char* Name() const override { - return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + bool ShouldMerge(const std::vector& operands) const override { + if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { + return true; } + return false; + } - bool ShouldMerge(const std::vector& operands) const override { - if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { - return true; - } - return false; - } + private: + size_t limit_ = 0; +}; +} // namespace - private: - size_t limit_ = 0; - }; +class DBMergeOperandTest : public DBTestBase { + public: + DBMergeOperandTest() + : DBTestBase("/db_merge_operand_test", /*env_do_fsync=*/true) {} +}; +TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { Options options; options.create_if_missing = true; // Use only the latest two merge operands. @@ -59,29 +61,29 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { merge_operands_info.expected_max_number_of_operands = num_records; // k0 value in memtable - Put("k0", "PutARock"); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(Put("k0", "PutARock")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k0", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "PutARock"); // k0.1 value in SST - Put("k0.1", "RockInSST"); + ASSERT_OK(Put("k0.1", "RockInSST")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k0.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k0.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "RockInSST"); // All k1 values are in memtable. 
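GetMergeOperands is the API exercised throughout this test: it returns the raw operands for a key without applying the merge operator. A minimal sketch of a call like the ones above (key name and operand count are illustrative; db is assumed to be an open rocksdb::DB*):

#include <rocksdb/db.h>
#include <vector>

constexpr int kMaxOperands = 4;
std::vector<rocksdb::PinnableSlice> operands(kMaxOperands);
rocksdb::GetMergeOperandsOptions info;
info.expected_max_number_of_operands = kMaxOperands;
int num_operands = 0;
rocksdb::Status s = db->GetMergeOperands(
    rocksdb::ReadOptions(), db->DefaultColumnFamily(), "k1", operands.data(),
    &info, &num_operands);
// On success, operands[0..num_operands) holds the base value (if any) followed
// by the merge operands. If more operands exist than
// expected_max_number_of_operands allows, the call returns Status::Incomplete,
// as the BlobDB variant of this test checks explicitly.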
ASSERT_OK(Merge("k1", "a")); - Put("k1", "x"); + ASSERT_OK(Put("k1", "x")); ASSERT_OK(Merge("k1", "b")); ASSERT_OK(Merge("k1", "c")); ASSERT_OK(Merge("k1", "d")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "x"); ASSERT_EQ(values[1], "b"); ASSERT_EQ(values[2], "c"); @@ -98,13 +100,13 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { // All k1.1 values are in memtable. ASSERT_OK(Merge("k1.1", "r")); - Delete("k1.1"); + ASSERT_OK(Delete("k1.1")); ASSERT_OK(Merge("k1.1", "c")); ASSERT_OK(Merge("k1.1", "k")); ASSERT_OK(Merge("k1.1", "s")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "c"); ASSERT_EQ(values[1], "k"); ASSERT_EQ(values[2], "s"); @@ -115,9 +117,9 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { ASSERT_OK(Merge("k2", "e")); ASSERT_OK(Merge("k2", "r")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "q"); ASSERT_EQ(values[1], "w"); ASSERT_EQ(values[2], "e"); @@ -125,30 +127,30 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { // All k2.1 values are flushed to L0 into a single file. ASSERT_OK(Merge("k2.1", "m")); - Put("k2.1", "l"); + ASSERT_OK(Put("k2.1", "l")); ASSERT_OK(Merge("k2.1", "n")); ASSERT_OK(Merge("k2.1", "o")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "l,n,o"); // All k2.2 values are flushed to L0 into a single file. ASSERT_OK(Merge("k2.2", "g")); - Delete("k2.2"); + ASSERT_OK(Delete("k2.2")); ASSERT_OK(Merge("k2.2", "o")); ASSERT_OK(Merge("k2.2", "t")); ASSERT_OK(Flush()); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k2.2", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2.2", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "o,t"); // Do some compaction that will make the following tests more predictable // Slice start("PutARock"); // Slice end("t"); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // All k3 values are flushed and are in different files. 
ASSERT_OK(Merge("k3", "ab")); @@ -158,9 +160,9 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { ASSERT_OK(Merge("k3", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3", "de")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "ab"); ASSERT_EQ(values[1], "bc"); ASSERT_EQ(values[2], "cd"); @@ -169,14 +171,14 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { // All k3.1 values are flushed and are in different files. ASSERT_OK(Merge("k3.1", "ab")); ASSERT_OK(Flush()); - Put("k3.1", "bc"); + ASSERT_OK(Put("k3.1", "bc")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.1", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.1", "de")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.1", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3.1", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "bc"); ASSERT_EQ(values[1], "cd"); ASSERT_EQ(values[2], "de"); @@ -184,14 +186,14 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { // All k3.2 values are flushed and are in different files. ASSERT_OK(Merge("k3.2", "ab")); ASSERT_OK(Flush()); - Delete("k3.2"); + ASSERT_OK(Delete("k3.2")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.2", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3.2", "de")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k3.2", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3.2", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "cd"); ASSERT_EQ(values[1], "de"); @@ -206,32 +208,120 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { ASSERT_OK(Flush()); MoveFilesToLevel(1); ASSERT_OK(Merge("k4", "ed")); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k4", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k4", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "ba"); ASSERT_EQ(values[1], "cb"); ASSERT_EQ(values[2], "dc"); ASSERT_EQ(values[3], "ed"); - // First 3 k5 values are in SST and next 4 k5 values are in Immutable Memtable + // First 3 k5 values are in SST and next 4 k5 values are in Immutable + // Memtable ASSERT_OK(Merge("k5", "who")); ASSERT_OK(Merge("k5", "am")); ASSERT_OK(Merge("k5", "i")); ASSERT_OK(Flush()); - Put("k5", "remember"); + ASSERT_OK(Put("k5", "remember")); ASSERT_OK(Merge("k5", "i")); ASSERT_OK(Merge("k5", "am")); ASSERT_OK(Merge("k5", "rocks")); - dbfull()->TEST_SwitchMemtable(); - db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k5", - values.data(), &merge_operands_info, - &number_of_operands); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k5", values.data(), &merge_operands_info, + &number_of_operands)); ASSERT_EQ(values[0], "remember"); ASSERT_EQ(values[1], "i"); ASSERT_EQ(values[2], "am"); } +TEST_F(DBMergeOperandTest, BlobDBGetMergeOperandsBasic) { + Options options; + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + // Use 
only the latest two merge operands. + options.merge_operator = std::make_shared(2, ','); + options.env = env_; + Reopen(options); + int num_records = 4; + int number_of_operands = 0; + std::vector values(num_records); + GetMergeOperandsOptions merge_operands_info; + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k1 values are in memtable. + ASSERT_OK(Put("k1", "x")); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k1", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "x"); + ASSERT_EQ(values[1], "b"); + ASSERT_EQ(values[2], "c"); + ASSERT_EQ(values[3], "d"); + + // expected_max_number_of_operands is less than number of merge operands so + // status should be Incomplete. + merge_operands_info.expected_max_number_of_operands = num_records - 1; + Status status = db_->GetMergeOperands( + ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), + &merge_operands_info, &number_of_operands); + ASSERT_EQ(status.IsIncomplete(), true); + merge_operands_info.expected_max_number_of_operands = num_records; + + // All k2 values are flushed to L0 into a single file. + ASSERT_OK(Put("k2", "q")); + ASSERT_OK(Merge("k2", "w")); + ASSERT_OK(Merge("k2", "e")); + ASSERT_OK(Merge("k2", "r")); + ASSERT_OK(Flush()); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k2", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "q,w,e,r"); + + // Do some compaction that will make the following tests more predictable + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // All k3 values are flushed and are in different files. 
+ ASSERT_OK(Put("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k3", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "ab"); + ASSERT_EQ(values[1], "bc"); + ASSERT_EQ(values[2], "cd"); + ASSERT_EQ(values[3], "de"); + + // All K4 values are in different levels + ASSERT_OK(Put("k4", "ba")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "cb")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "dc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "ed")); + ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + "k4", values.data(), &merge_operands_info, + &number_of_operands)); + ASSERT_EQ(values[0], "ba"); + ASSERT_EQ(values[1], "cb"); + ASSERT_EQ(values[2], "dc"); + ASSERT_EQ(values[3], "ed"); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc index 3db68685185..4b819bd624a 100644 --- a/db/db_merge_operator_test.cc +++ b/db/db_merge_operator_test.cc @@ -37,7 +37,7 @@ class TestReadCallback : public ReadCallback { class DBMergeOperatorTest : public DBTestBase { public: DBMergeOperatorTest() - : DBTestBase("/db_merge_operator_test", /*env_do_fsync=*/true) {} + : DBTestBase("/db_merge_operator_test", /*env_do_fsync=*/false) {} std::string GetWithReadCallback(SnapshotChecker* snapshot_checker, const Slice& key, @@ -94,7 +94,7 @@ TEST_F(DBMergeOperatorTest, LimitMergeOperands) { ASSERT_OK(Merge("k1", "c")); ASSERT_OK(Merge("k1", "d")); std::string value; - ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k1", &value)); // Make sure that only the latest two merge operands are used. If this was // not the case the value would be "a,b,c,d". ASSERT_EQ(value, "c,d"); @@ -105,7 +105,7 @@ TEST_F(DBMergeOperatorTest, LimitMergeOperands) { ASSERT_OK(Merge("k2", "c")); ASSERT_OK(Merge("k2", "d")); ASSERT_OK(Flush()); - ASSERT_TRUE(db_->Get(ReadOptions(), "k2", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k2", &value)); ASSERT_EQ(value, "c,d"); // All K3 values are flushed and are in different files. 
@@ -116,7 +116,7 @@ TEST_F(DBMergeOperatorTest, LimitMergeOperands) { ASSERT_OK(Merge("k3", "cd")); ASSERT_OK(Flush()); ASSERT_OK(Merge("k3", "de")); - ASSERT_TRUE(db_->Get(ReadOptions(), "k3", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k3", &value)); ASSERT_EQ(value, "cd,de"); // All K4 values are in different levels @@ -130,7 +130,7 @@ TEST_F(DBMergeOperatorTest, LimitMergeOperands) { ASSERT_OK(Flush()); MoveFilesToLevel(1); ASSERT_OK(Merge("k4", "de")); - ASSERT_TRUE(db_->Get(ReadOptions(), "k4", &value).ok()); + ASSERT_OK(db_->Get(ReadOptions(), "k4", &value)); ASSERT_EQ(value, "cd,de"); } @@ -344,8 +344,9 @@ TEST_P(MergeOperatorPinningTest, EvictCacheBeforeMerge) { // Code executed before merge operation merge_hook->before_merge_ = [&]() { // Evict all tables from cache before every merge operation + auto* table_cache = dbfull()->TEST_table_cache(); for (uint64_t num : file_numbers) { - TableCache::Evict(dbfull()->TEST_table_cache(), num); + TableCache::Evict(table_cache, num); } // Decrease cache capacity to force all unrefed blocks to be evicted if (bbto.block_cache) { @@ -366,7 +367,7 @@ TEST_P(MergeOperatorPinningTest, EvictCacheBeforeMerge) { VerifyDBFromMap(true_data, &total_reads); ASSERT_EQ(merge_cnt, total_reads); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); VerifyDBFromMap(true_data, &total_reads); } @@ -385,7 +386,7 @@ TEST_P(MergeOperatorPinningTest, TailingIterator) { std::function writer_func = [&]() { int k = 0; for (int i = 0; i < kNumWrites; i++) { - db_->Merge(WriteOptions(), Key(k), Key(k)); + ASSERT_OK(db_->Merge(WriteOptions(), Key(k), Key(k))); if (i && i % kNumOperands == 0) { k++; @@ -403,7 +404,7 @@ TEST_P(MergeOperatorPinningTest, TailingIterator) { ReadOptions ro; ro.tailing = true; Iterator* iter = db_->NewIterator(ro); - + ASSERT_OK(iter->status()); iter->SeekToFirst(); for (int i = 0; i < (kNumWrites / kNumOperands); i++) { while (!iter->Valid()) { @@ -416,6 +417,7 @@ TEST_P(MergeOperatorPinningTest, TailingIterator) { iter->Next(); } + ASSERT_OK(iter->status()); delete iter; }; @@ -449,12 +451,13 @@ TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) { // ForwardIterator to not pin it in some circumstances. This test // reproduces it. 
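Both tests in this region drive tailing iterators. A minimal sketch of creating one (db is assumed to be an open rocksdb::DB*; on non-lite builds the tailing flag is what selects the ForwardIterator path exercised here):

#include <rocksdb/db.h>
#include <memory>

rocksdb::ReadOptions ro;
ro.tailing = true;  // the iterator keeps seeing data written after its creation
std::unique_ptr<rocksdb::Iterator> iter(db->NewIterator(ro));
iter->Seek("key");
if (iter->Valid()) {
  // value() reflects the newest visible merge result for "key"
}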
- db_->Merge(WriteOptions(), "key", "sst"); - db_->Flush(FlushOptions()); // Switch to SuperVersion A - db_->Merge(WriteOptions(), "key", "memtable"); + ASSERT_OK(db_->Merge(WriteOptions(), "key", "sst")); + ASSERT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion A + ASSERT_OK(db_->Merge(WriteOptions(), "key", "memtable")); // Pin SuperVersion A std::unique_ptr someone_else(db_->NewIterator(ReadOptions())); + ASSERT_OK(someone_else->status()); bool pushed_first_operand = false; bool stepped_to_next_operand = false; @@ -462,7 +465,7 @@ TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) { "DBIter::MergeValuesNewToOld:PushedFirstOperand", [&](void*) { EXPECT_FALSE(pushed_first_operand); pushed_first_operand = true; - db_->Flush(FlushOptions()); // Switch to SuperVersion B + EXPECT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion B }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) { @@ -477,7 +480,7 @@ TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) { std::unique_ptr iter(db_->NewIterator(ro)); iter->Seek("key"); - ASSERT_TRUE(iter->status().ok()); + ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); EXPECT_EQ(std::string("sst,memtable"), iter->value().ToString()); EXPECT_TRUE(pushed_first_operand); diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 0ab06489c8f..96fd37357dd 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -34,7 +34,7 @@ class DBOptionsTest : public DBTestBase { const DBOptions& options) { std::string options_str; std::unordered_map mutable_map; - ConfigOptions config_options; + ConfigOptions config_options(options); config_options.delimiter = "; "; EXPECT_OK(GetStringFromMutableDBOptions( @@ -79,6 +79,22 @@ class DBOptionsTest : public DBTestBase { #endif // ROCKSDB_LITE }; +TEST_F(DBOptionsTest, ImmutableTrackAndVerifyWalsInManifest) { + Options options; + options.env = env_; + options.track_and_verify_wals_in_manifest = true; + + ImmutableDBOptions db_options(options); + ASSERT_TRUE(db_options.track_and_verify_wals_in_manifest); + + Reopen(options); + ASSERT_TRUE(dbfull()->GetDBOptions().track_and_verify_wals_in_manifest); + + Status s = + dbfull()->SetDBOptions({{"track_and_verify_wals_in_manifest", "false"}}); + ASSERT_FALSE(s.ok()); +} + // RocksDB lite don't support dynamic options. 
#ifndef ROCKSDB_LITE @@ -113,6 +129,83 @@ TEST_F(DBOptionsTest, GetLatestCFOptions) { GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[1]))); } +TEST_F(DBOptionsTest, SetMutableTableOptions) { + Options options; + options.create_if_missing = true; + options.env = env_; + options.blob_file_size = 16384; + BlockBasedTableOptions bbto; + bbto.no_block_cache = true; + bbto.block_size = 8192; + bbto.block_restart_interval = 7; + + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); + Options c_opts = dbfull()->GetOptions(cfh); + const auto* c_bbto = + c_opts.table_factory->GetOptions(); + ASSERT_NE(c_bbto, nullptr); + ASSERT_EQ(c_opts.blob_file_size, 16384); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 8192); + ASSERT_EQ(c_bbto->block_restart_interval, 7); + ASSERT_OK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "16384"}, + {"table_factory.block_restart_interval", "11"}})); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Now set an option that is not mutable - options should not change + ASSERT_NOK( + dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Set some that are mutable and some that are not - options should not change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.no_block_cache", "false"}, + {"table_factory.block_size", "8192"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Set some that are mutable and some that do not exist - options should not + // change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "8192"}, + {"table_factory.does_not_exist", "true"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->no_block_cache, true); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 11); + + // Trying to change the table factory fails + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory", TableFactory::kPlainTableName()}})); + + // Set some on the table and some on the Column Family + ASSERT_OK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "16384"}, + {"blob_file_size", "32768"}, + {"table_factory.block_restart_interval", "13"}})); + c_opts = dbfull()->GetOptions(cfh); + ASSERT_EQ(c_opts.blob_file_size, 32768); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 13); + // Set some on the table and a bad one on the ColumnFamily - options should + // not change + ASSERT_NOK(dbfull()->SetOptions( + cfh, {{"table_factory.block_size", "1024"}, + {"no_such_option", "32768"}, + {"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto->block_size, 16384); + ASSERT_EQ(c_bbto->block_restart_interval, 13); +} + TEST_F(DBOptionsTest, SetBytesPerSync) { const size_t kValueSize = 1024 * 1024; // 1MB Options options; @@ -174,10 +267,11 @@ TEST_F(DBOptionsTest, SetWalBytesPerSync) { options.env = env_; Reopen(options); ASSERT_EQ(512, dbfull()->GetDBOptions().wal_bytes_per_sync); - int counter = 0; + std::atomic_int counter{0}; int low_bytes_per_sync = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { 
counter++; }); + "WritableFileWriter::RangeSync:0", + [&](void* /*arg*/) { counter.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); const std::string kValue(kValueSize, 'v'); int i = 0; @@ -606,6 +700,7 @@ TEST_F(DBOptionsTest, MaxOpenFilesChange) { TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) { Options options; + options.env = CurrentOptions().env; options.delayed_write_rate = 0; Reopen(options); ASSERT_EQ(16 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate); @@ -617,6 +712,7 @@ TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) { TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) { Options options; + options.env = CurrentOptions().env; options.compaction_style = kCompactionStyleUniversal; options.ttl = 0; @@ -646,6 +742,7 @@ TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) { TEST_F(DBOptionsTest, SanitizeTtlDefault) { Options options; + options.env = CurrentOptions().env; Reopen(options); ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); @@ -662,6 +759,7 @@ TEST_F(DBOptionsTest, SanitizeTtlDefault) { TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { Options options; options.compaction_style = kCompactionStyleFIFO; + options.env = CurrentOptions().env; options.ttl = 0; Reopen(options); ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); @@ -687,6 +785,7 @@ TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { Options options; + options.env = CurrentOptions().env; options.compaction_style = kCompactionStyleFIFO; options.write_buffer_size = 10 << 10; // 10KB options.arena_block_size = 4096; @@ -708,7 +807,7 @@ TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { for (int j = 0; j < 10; j++) { ASSERT_OK(Put(ToString(i * 20 + j), rnd.RandomString(980))); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 10); @@ -826,6 +925,7 @@ TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) { options.compaction_style = kCompactionStyleFIFO; options.write_buffer_size = 10 << 10; // 10KB options.create_if_missing = true; + options.env = CurrentOptions().env; ASSERT_OK(TryReopen(options)); @@ -879,6 +979,7 @@ TEST_F(DBOptionsTest, ChangeCompression) { options.bottommost_compression = CompressionType::kNoCompression; options.bottommost_compression_opts.level = 2; options.bottommost_compression_opts.parallel_threads = 1; + options.env = CurrentOptions().env; ASSERT_OK(TryReopen(options)); @@ -929,6 +1030,66 @@ TEST_F(DBOptionsTest, ChangeCompression) { #endif // ROCKSDB_LITE +TEST_F(DBOptionsTest, BottommostCompressionOptsWithFallbackType) { + // Verify the bottommost compression options still take effect even when the + // bottommost compression type is left at its default value. Verify for both + // automatic and manual compaction. 
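A sketch of the option combination this test covers: the bottommost CompressionOptions take effect purely through their enabled flag, while bottommost_compression itself stays at its default of kDisableCompressionOption (values mirror the test):

#include <rocksdb/options.h>

rocksdb::Options options;
options.compression = rocksdb::kLZ4Compression;
options.compression_opts.level = 1;             // applies to upper levels
options.bottommost_compression_opts.level = 2;  // applies to the last level
options.bottommost_compression_opts.enabled = true;
// Without `enabled`, the bottommost CompressionOptions are ignored unless
// bottommost_compression is also set to a concrete compression type.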
+ if (!Snappy_Supported() || !LZ4_Supported()) { + return; + } + + constexpr int kUpperCompressionLevel = 1; + constexpr int kBottommostCompressionLevel = 2; + constexpr int kNumL0Files = 2; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.compression = CompressionType::kLZ4Compression; + options.compression_opts.level = kUpperCompressionLevel; + options.bottommost_compression_opts.level = kBottommostCompressionLevel; + options.bottommost_compression_opts.enabled = true; + Reopen(options); + + CompressionType compression_used = CompressionType::kDisableCompressionOption; + CompressionOptions compression_opt_used; + bool compacted = false; + SyncPoint::GetInstance()->SetCallBack( + "CompactionPicker::RegisterCompaction:Registered", [&](void* arg) { + Compaction* c = static_cast(arg); + compression_used = c->output_compression(); + compression_opt_used = c->output_compression_opts(); + compacted = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // First, verify for automatic compaction. + for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put("foo", "foofoofoo")); + ASSERT_OK(Put("bar", "foofoofoo")); + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_TRUE(compacted); + ASSERT_EQ(CompressionType::kLZ4Compression, compression_used); + ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level); + + // Second, verify for manual compaction. + compacted = false; + compression_used = CompressionType::kDisableCompressionOption; + compression_opt_used = CompressionOptions(); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_TRUE(compacted); + ASSERT_EQ(CompressionType::kLZ4Compression, compression_used); + ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index ff6ded37fa7..8945ee29148 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -336,7 +336,7 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) { table_options.filter_policy.reset( NewBloomFilterPolicy(kBloomBitsPerKey, false)); table_options.block_size = 1024; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -536,7 +536,7 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { table_options.filter_policy.reset( NewBloomFilterPolicy(kBloomBitsPerKey, false)); table_options.block_size = 1024; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -1175,6 +1175,61 @@ class CountingDeleteTabPropCollectorFactory } }; +class BlockCountingTablePropertiesCollector : public TablePropertiesCollector { + public: + static const std::string kNumSampledBlocksPropertyName; + + const char* Name() const override { + return "BlockCountingTablePropertiesCollector"; + } + + Status Finish(UserCollectedProperties* properties) override { + (*properties)[kNumSampledBlocksPropertyName] = + ToString(num_sampled_blocks_); + return Status::OK(); + } + + Status 
AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { + return Status::OK(); + } + + void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) override { + if (block_compressed_bytes_fast > 0 || block_compressed_bytes_slow > 0) { + num_sampled_blocks_++; + } + } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{ + {kNumSampledBlocksPropertyName, ToString(num_sampled_blocks_)}, + }; + } + + private: + uint32_t num_sampled_blocks_ = 0; +}; + +const std::string + BlockCountingTablePropertiesCollector::kNumSampledBlocksPropertyName = + "NumSampledBlocks"; + +class BlockCountingTablePropertiesCollectorFactory + : public TablePropertiesCollectorFactory { + public: + const char* Name() const override { + return "BlockCountingTablePropertiesCollectorFactory"; + } + + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /* context */) override { + return new BlockCountingTablePropertiesCollector(); + } +}; + #ifndef ROCKSDB_LITE TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) { Options options = CurrentOptions(); @@ -1413,8 +1468,134 @@ TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) { } } +// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage. +TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) { + // Sampled compression requires at least one of the following four types. + if (!Snappy_Supported() && !Zlib_Supported() && !LZ4_Supported() && + !ZSTD_Supported()) { + return; + } + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.table_properties_collector_factories.emplace_back( + std::make_shared()); + + for (bool sample_for_compression : {false, true}) { + // For simplicity/determinism, sample 100% when enabled, or 0% when disabled + options.sample_for_compression = sample_for_compression ? 1 : 0; + + DestroyAndReopen(options); + + // Setup the following LSM: + // + // L0_0 ["a", "b"] + // L1_0 ["a", "b"] + // + // L0_0 was created by flush. L1_0 was created by compaction. Each file + // contains one data block. + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("a", "val")); + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Flush()); + if (i == 1) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + } + + // A `BlockAdd()` should have been seen for files generated by flush or + // compaction when `sample_for_compression` is enabled. + TablePropertiesCollection file_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props)); + ASSERT_EQ(2, file_to_props.size()); + for (const auto& file_and_props : file_to_props) { + auto& user_props = file_and_props.second->user_collected_properties; + ASSERT_TRUE(user_props.find(BlockCountingTablePropertiesCollector:: + kNumSampledBlocksPropertyName) != + user_props.end()); + ASSERT_EQ(user_props.at(BlockCountingTablePropertiesCollector:: + kNumSampledBlocksPropertyName), + ToString(sample_for_compression ? 
1 : 0)); + } + } +} + +class CompressionSamplingDBPropertiesTest + : public DBPropertiesTest, + public ::testing::WithParamInterface { + public: + CompressionSamplingDBPropertiesTest() : fast_(GetParam()) {} + + protected: + const bool fast_; +}; + +INSTANTIATE_TEST_CASE_P(CompressionSamplingDBPropertiesTest, + CompressionSamplingDBPropertiesTest, ::testing::Bool()); + +// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage. +TEST_P(CompressionSamplingDBPropertiesTest, + EstimateDataSizeWithCompressionSampling) { + Options options = CurrentOptions(); + if (fast_) { + // One of the following light compression libraries must be present. + if (LZ4_Supported()) { + options.compression = kLZ4Compression; + } else if (Snappy_Supported()) { + options.compression = kSnappyCompression; + } else { + return; + } + } else { + // One of the following heavy compression libraries must be present. + if (ZSTD_Supported()) { + options.compression = kZSTD; + } else if (Zlib_Supported()) { + options.compression = kZlibCompression; + } else { + return; + } + } + options.disable_auto_compactions = true; + // For simplicity/determinism, sample 100%. + options.sample_for_compression = 1; + Reopen(options); + + // Setup the following LSM: + // + // L0_0 ["a", "b"] + // L1_0 ["a", "b"] + // + // L0_0 was created by flush. L1_0 was created by compaction. Each file + // contains one data block. The value consists of compressible data so the + // data block should be stored compressed. + std::string val(1024, 'a'); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("a", val)); + ASSERT_OK(Put("b", val)); + ASSERT_OK(Flush()); + if (i == 1) { + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + } + } + + TablePropertiesCollection file_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props)); + ASSERT_EQ(2, file_to_props.size()); + for (const auto& file_and_props : file_to_props) { + ASSERT_GT(file_and_props.second->data_size, 0); + if (fast_) { + ASSERT_EQ(file_and_props.second->data_size, + file_and_props.second->fast_compression_estimated_data_size); + } else { + ASSERT_EQ(file_and_props.second->data_size, + file_and_props.second->slow_compression_estimated_data_size); + } + } +} + TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) { - Options options; + Options options = CurrentOptions(); Reopen(options); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Delete("foo")); @@ -1445,6 +1626,7 @@ TEST_F(DBPropertiesTest, EstimateOldestKeyTime) { options.compaction_style = kCompactionStyleFIFO; options.ttl = 300; + options.max_open_files = -1; options.compaction_options_fifo.allow_compaction = false; DestroyAndReopen(options); @@ -1524,6 +1706,7 @@ TEST_F(DBPropertiesTest, SstFilesSize) { std::shared_ptr listener = std::make_shared(); Options options; + options.env = CurrentOptions().env; options.disable_auto_compactions = true; options.listeners.push_back(listener); Reopen(options); @@ -1608,6 +1791,8 @@ TEST_F(DBPropertiesTest, BlockCacheProperties) { Options options; uint64_t value; + options.env = CurrentOptions().env; + // Block cache properties are not available for tables other than // block-based table. 
options.table_factory.reset(NewPlainTableFactory()); diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 706f95e49d3..10720e758dd 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -14,7 +14,7 @@ namespace ROCKSDB_NAMESPACE { class DBRangeDelTest : public DBTestBase { public: - DBRangeDelTest() : DBTestBase("/db_range_del_test", /*env_do_fsync=*/true) {} + DBRangeDelTest() : DBTestBase("/db_range_del_test", /*env_do_fsync=*/false) {} std::string GetNumericStr(int key) { uint64_t uint64_key = static_cast(key); @@ -56,7 +56,7 @@ TEST_F(DBRangeDelTest, EndSameAsStartCoversNothing) { } TEST_F(DBRangeDelTest, EndComesBeforeStartInvalidArgument) { - db_->Put(WriteOptions(), "b", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "b", "val")); ASSERT_TRUE( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "a") .IsInvalidArgument()); @@ -73,6 +73,15 @@ TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) { } while (ChangeOptions(kRangeDelSkipConfigs)); } +TEST_F(DBRangeDelTest, DictionaryCompressionWithOnlyRangeTombstones) { + Options opts = CurrentOptions(); + opts.compression_opts.max_dict_bytes = 16384; + Reopen(opts); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", + "dr2")); + ASSERT_OK(db_->Flush(FlushOptions())); +} + TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) { do { Options opts = CurrentOptions(); @@ -82,13 +91,14 @@ TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) { // snapshot protects range tombstone from dropping due to becoming obsolete. const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"); - db_->Flush(FlushOptions()); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_EQ(1, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE)); @@ -118,7 +128,8 @@ TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) { // snapshot protects range tombstone from dropping due to becoming obsolete. 
const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(1)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(1))); Random rnd(301); for (int i = 0; i < kNumFiles; ++i) { @@ -128,18 +139,18 @@ TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) { values.push_back(rnd.RandomString(3 << 10)); ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); if (j == 0 && i > 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } } // put extra key to trigger final flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(2, NumTableFilesAtLevel(1)); db_->ReleaseSnapshot(snapshot); @@ -159,7 +170,7 @@ TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) { // Want max_compaction_bytes to trigger the end of compaction output file, not // target_file_size_base, so make the latter much bigger opts.target_file_size_base = 100 * opts.max_compaction_bytes; - Reopen(opts); + DestroyAndReopen(opts); // snapshot protects range tombstone from dropping due to becoming obsolete. const Snapshot* snapshot = db_->GetSnapshot(); @@ -178,12 +189,12 @@ TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) { } // extra entry to trigger SpecialSkipListFactory's flush ASSERT_OK(Put(GetNumericStr(kNumPerFile), "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GE(NumTableFilesAtLevel(1), 2); @@ -221,10 +232,10 @@ TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) { } TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) { - db_->Put(WriteOptions(), "b1", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "b1", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c")); - db_->Put(WriteOptions(), "b2", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "b2", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b")); // first iteration verifies query correctness in memtable, second verifies @@ -241,8 +252,9 @@ TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) { } TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) { - db_->Put(WriteOptions(), "unused", "val"); // prevents empty after compaction - db_->Put(WriteOptions(), "b1", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "unused", + "val")); // prevents empty after compaction + ASSERT_OK(db_->Put(WriteOptions(), "b1", "val")); ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c")); @@ -254,8 +266,8 @@ TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) { for (int i = 0; i < 2; ++i) { if (i > 0) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, 
nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); } @@ -269,7 +281,7 @@ TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) { const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250; Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); - Reopen(opts); + DestroyAndReopen(opts); // Write a third before snapshot, a third between snapshot and tombstone, and // a third after the tombstone. Keys older than snapshot or newer than the @@ -279,12 +291,13 @@ TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) { if (i == kNum / 3) { snapshot = db_->GetSnapshot(); } else if (i == 2 * kNum / 3) { - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); for (int i = 0; i < kNum; ++i) { ReadOptions read_opts; @@ -309,29 +322,32 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) { opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); opts.num_levels = 2; opts.statistics = CreateDBStatistics(); - Reopen(opts); + DestroyAndReopen(opts); for (int i = 0; i < kNumFiles; ++i) { if (i > 0) { // range tombstone covers first half of the previous file - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr((i - 1) * kNumPerFile), - GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2)); + ASSERT_OK(db_->DeleteRange( + WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr((i - 1) * kNumPerFile), + GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2))); } // Make sure a given key appears in each file so compaction won't be able to // use trivial move, which would happen if the ranges were non-overlapping. // Also, we need an extra element since flush is only triggered when the // number of keys is one greater than SpecialSkipListFactory's limit. // We choose a key outside the key-range used by the test to avoid conflict. 
- db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles), + "val")); for (int j = 0; j < kNumPerFile; ++j) { - db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val"); + ASSERT_OK( + db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val")); } - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_GT(NumTableFilesAtLevel(1), 0); ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2, @@ -373,8 +389,8 @@ TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) { if (i > 0) { // delete [95,105) in two files, [295,305) in next two int mid = (j + (1 - j % 2)) * kNumPerFile; - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(mid - 5), Key(mid + 5)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(mid - 5), Key(mid + 5))); } std::vector values; // Write 100KB (100 values, each 1K) @@ -384,7 +400,7 @@ TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) { } // put extra key to trigger flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); if (j < kNumFiles - 1) { // background compaction may happen early for kNumFiles'th file ASSERT_EQ(NumTableFilesAtLevel(0), j + 1); @@ -400,7 +416,7 @@ TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) { // oversized L0 (relative to base_level) causes the compaction to run // earlier. ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(), {{"disable_auto_compactions", "true"}})); ASSERT_EQ(NumTableFilesAtLevel(0), 0); @@ -433,8 +449,8 @@ TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) { // insert range deletions [95,105) in two files, [295,305) in next two // to prepare L1 for later manual compaction. 
int mid = (j + (1 - j % 2)) * kNumPerFile; - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(mid - 5), Key(mid + 5)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(mid - 5), Key(mid + 5))); } std::vector values; // Write 100KB (100 values, each 1K) @@ -444,13 +460,13 @@ TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) { } // put extra key to trigger flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); if (j < kFilesPerLevel - 1) { // background compaction may happen early for kFilesPerLevel'th file ASSERT_EQ(NumTableFilesAtLevel(0), j + 1); } } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1); } @@ -483,17 +499,17 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) { for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) { if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) { // Delete merge operands from all but the last file - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key", - "key_"); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); } std::string val; PutFixed64(&val, i); - db_->Merge(WriteOptions(), "key", val); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); // we need to prevent trivial move using Puts so compaction will actually // process the merge operands. - db_->Put(WriteOptions(), "prevent_trivial_move", ""); + ASSERT_OK(db_->Put(WriteOptions(), "prevent_trivial_move", "")); if (i > 0 && i % kNumPerFile == 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } @@ -504,7 +520,7 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) { PutFixed64(&expected, 45); // 1+2+...+9 ASSERT_EQ(expected, actual); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); expected.clear(); ASSERT_OK(db_->Get(read_opts, "key", &actual)); @@ -550,19 +566,19 @@ TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) { opts.statistics = CreateDBStatistics(); Reopen(opts); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", - "dr10"); // obsolete after compaction - db_->Put(WriteOptions(), "key", "val"); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", + "dr10")); // obsolete after compaction + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2", - "dr20"); // protected by snapshot - db_->Put(WriteOptions(), "key", "val"); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2", + "dr20")); // protected by snapshot + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); + ASSERT_OK(db_->Flush(FlushOptions())); ASSERT_EQ(2, NumTableFilesAtLevel(0)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE)); @@ -603,28 +619,30 @@ TEST_F(DBRangeDelTest, 
TableEvictedDuringScan) { bbto.cache_index_and_filter_blocks = true; bbto.block_cache = NewLRUCache(8 << 20); opts.table_factory.reset(NewBlockBasedTableFactory(bbto)); - Reopen(opts); + DestroyAndReopen(opts); // Hold a snapshot so range deletions can't become obsolete during compaction // to bottommost level (i.e., L1). const Snapshot* snapshot = db_->GetSnapshot(); for (int i = 0; i < kNum; ++i) { - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); if (i > 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) { - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } } // Must be > 1 so the first L1 file can be closed before scan finishes - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(NumTableFilesAtLevel(1), 1); std::vector file_numbers = ListTableFiles(env_, dbname_); ReadOptions read_opts; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int expected = kRangeEnd; iter->SeekToFirst(); for (auto file_number : file_numbers) { @@ -647,7 +665,7 @@ TEST_F(DBRangeDelTest, TableEvictedDuringScan) { TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) { do { DestroyAndReopen(CurrentOptions()); - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); @@ -669,10 +687,10 @@ TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) { opts.memtable_factory.reset(new SpecialSkipListFactory(1)); DestroyAndReopen(opts); - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "blah", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "blah", "val")); ReadOptions read_opts; std::string value; @@ -683,7 +701,7 @@ TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) { TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) { do { DestroyAndReopen(CurrentOptions()); - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( @@ -706,11 +724,11 @@ TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) { for (int i = 0; i < kNumMergeOps; ++i) { std::string val; PutFixed64(&val, i); - db_->Merge(WriteOptions(), "key", val); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); if (i == kNumMergeOps / 2) { // deletes [0, 5] - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key", - "key_"); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); } } @@ -734,16 +752,16 @@ TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) { opts.memtable_factory.reset(new SpecialSkipListFactory(1)); Reopen(opts); - db_->Put(WriteOptions(), "sst_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); ASSERT_OK(db_->Flush(FlushOptions())); - 
db_->Put(WriteOptions(), "imm_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "mem_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); @@ -761,20 +779,22 @@ TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) { Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); - Reopen(opts); + DestroyAndReopen(opts); // Write half of the keys before the tombstone and half after the tombstone. // Only covered keys (i.e., within the range and older than the tombstone) // should be deleted. for (int i = 0; i < kNum; ++i) { if (i == kNum / 2) { - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); } ReadOptions read_opts; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int expected = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -794,7 +814,7 @@ TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) { Options opts = CurrentOptions(); opts.comparator = test::Uint64Comparator(); opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile)); - Reopen(opts); + DestroyAndReopen(opts); const Snapshot* snapshot = nullptr; // Put a snapshot before the range tombstone, verify an iterator using that @@ -802,14 +822,16 @@ TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) { for (int i = 0; i < kNum; ++i) { if (i == kNum / 2) { snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - GetNumericStr(kRangeBegin), GetNumericStr(kRangeEnd)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + GetNumericStr(kRangeBegin), + GetNumericStr(kRangeEnd))); } - db_->Put(WriteOptions(), GetNumericStr(i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val")); } ReadOptions read_opts; read_opts.snapshot = snapshot; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int expected = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -828,22 +850,23 @@ TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) { opts.memtable_factory.reset(new SpecialSkipListFactory(1)); Reopen(opts); - db_->Put(WriteOptions(), "sst_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); ASSERT_OK(db_->Flush(FlushOptions())); - db_->Put(WriteOptions(), "imm_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "mem_key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val")); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); ReadOptions read_opts; read_opts.ignore_range_deletions = true; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); int i = 0; std::string expected[] = {"imm_key", 
"mem_key", "sst_key"}; for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) { @@ -857,7 +880,7 @@ TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) { #ifndef ROCKSDB_UBSAN_RUN TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) { - db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK(db_->Put(WriteOptions(), "key", "val")); // snapshot prevents key from being deleted during flush const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK( @@ -873,6 +896,7 @@ TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) { iter->SeekToFirst(); } ASSERT_TRUE(iter->status().IsNotSupported()); + delete iter; if (i == 0) { ASSERT_OK(db_->Flush(FlushOptions())); @@ -882,7 +906,6 @@ TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) { } db_->ReleaseSnapshot(snapshot); } - #endif // !ROCKSDB_UBSAN_RUN TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) { @@ -926,8 +949,8 @@ TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) { ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); - db_->EnableAutoCompaction({db_->DefaultColumnFamily()}); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()})); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->ReleaseSnapshot(snapshot); } @@ -949,7 +972,7 @@ TEST_F(DBRangeDelTest, MemtableBloomFilter) { for (int i = 0; i < kNumKeys; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(kNumKeys))); for (int i = 0; i < kNumKeys; ++i) { @@ -987,8 +1010,8 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { // snapshot protects range tombstone from dropping due to becoming obsolete. const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), - Key(2 * kNumFilesPerLevel)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(0), Key(2 * kNumFilesPerLevel))); Random rnd(301); std::string value = rnd.RandomString(kValueBytes); @@ -997,14 +1020,14 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { ASSERT_OK(Put(Key(j), value)); ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value)); if (j > 0) { - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(j, NumTableFilesAtLevel(0)); } } // put extra key to trigger final flush ASSERT_OK(Put("", "")); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1)); @@ -1022,7 +1045,7 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { } else if (i == 2) { ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(), {{"max_bytes_for_level_base", "10000"}})); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, NumTableFilesAtLevel(1)); } ASSERT_GT(NumTableFilesAtLevel(2), 0); @@ -1056,8 +1079,8 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { // A snapshot protects the range tombstone from dropping due to // becoming obsolete. 
const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), - Key(0), Key(2 * kNumFilesPerLevel)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(2 * kNumFilesPerLevel))); // Create 2 additional sstables in L0. Note that the first sstable // contains the range tombstone. @@ -1096,7 +1119,7 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { ASSERT_EQ(value, Get(Key(2))); auto begin_str = Key(3); const ROCKSDB_NAMESPACE::Slice begin = begin_str; - dbfull()->TEST_CompactRange(1, &begin, nullptr); + ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr)); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(2, NumTableFilesAtLevel(2)); ASSERT_EQ(value, Get(Key(2))); @@ -1115,7 +1138,7 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { // [key000002#6,1, key000004#72057594037927935,15] auto begin_str = Key(0); const ROCKSDB_NAMESPACE::Slice begin = begin_str; - dbfull()->TEST_CompactRange(1, &begin, &begin); + ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, &begin)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); ASSERT_EQ(3, NumTableFilesAtLevel(2)); } @@ -1216,9 +1239,9 @@ TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) { std::string value; ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound()); - dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */, - nullptr /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 0 /* level */, nullptr /* begin */, nullptr /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); // Now we have multiple files at L1 all containing a single user key, thus // guaranteeing overlap in the file endpoints. @@ -1229,9 +1252,9 @@ TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) { // Compact and verify again. It's worthwhile because now the files have // tighter endpoints, so we can verify that doesn't mess anything up. 
- dbfull()->TEST_CompactRange(1 /* level */, nullptr /* begin */, - nullptr /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 1 /* level */, nullptr /* begin */, nullptr /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); ASSERT_GT(NumTableFilesAtLevel(2), 1); ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound()); @@ -1307,6 +1330,7 @@ TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) { auto get_key_count = [this]() -> int { auto* iter = db_->NewIterator(ReadOptions()); + assert(iter->status().ok()); iter->SeekToFirst(); int keys_found = 0; for (; iter->Valid(); iter->Next()) { @@ -1409,6 +1433,7 @@ TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) { ASSERT_GT(NumTableFilesAtLevel(1), 1); auto* iter = db_->NewIterator(ReadOptions()); + ASSERT_OK(iter->status()); iter->SeekToLast(); int keys_found = 0; for (; iter->Valid(); iter->Prev()) { @@ -1435,11 +1460,12 @@ TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) { ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), Key(10))); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); ReadOptions read_opts; read_opts.snapshot = snapshot; auto* iter = db_->NewIterator(read_opts); + ASSERT_OK(iter->status()); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); @@ -1482,6 +1508,7 @@ TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeysInImmMemTables) { ReadOptions read_opts; read_opts.snapshot = snapshot.get(); std::unique_ptr iter(db_->NewIterator(read_opts)); + ASSERT_OK(iter->status()); TEST_SYNC_POINT("SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator"); @@ -1519,7 +1546,7 @@ TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) { std::string value = rnd.RandomString(kValueBytes); ASSERT_OK(Put(key, value)); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); MoveFilesToLevel(2); } ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -1538,7 +1565,7 @@ TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) { // TODO(ajkr): remove this `Put` after file cutting accounts for range // tombstones (#3977). ASSERT_OK(Put("c" + Key(1), "value")); - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone // and the range tombstone is only placed in the second SST. @@ -1546,9 +1573,9 @@ TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) { Slice begin_key(begin_key_storage); std::string end_key_storage("d"); Slice end_key(end_key_storage); - dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */, - &end_key /* end */, nullptr /* column_family */, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange( + 0 /* level */, &begin_key /* begin */, &end_key /* end */, + nullptr /* column_family */, true /* disallow_trivial_move */)); ASSERT_EQ(2, NumTableFilesAtLevel(1)); std::vector all_metadata; @@ -1613,15 +1640,15 @@ TEST_F(DBRangeDelTest, OverlappedTombstones) { ASSERT_EQ(1, NumTableFilesAtLevel(0)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); // The tombstone range is not broken up into multiple SSTs which may incur a // large compaction with L2. 
ASSERT_EQ(1, NumTableFilesAtLevel(1)); std::vector> files; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(1, NumTableFilesAtLevel(2)); ASSERT_EQ(0, NumTableFilesAtLevel(1)); } @@ -1654,14 +1681,15 @@ TEST_F(DBRangeDelTest, OverlappedKeys) { // The key range is broken up into three SSTs to avoid a future big compaction // with the grandparent - dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); ASSERT_EQ(3, NumTableFilesAtLevel(1)); - std::vector> files; - dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, - true /* disallow_trivial_move */); - ASSERT_EQ(1, NumTableFilesAtLevel(2)); + ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); + ASSERT_EQ( + 3, NumTableFilesAtLevel( + 2)); // L1->L2 compaction size is limited to max_compaction_bytes ASSERT_EQ(0, NumTableFilesAtLevel(1)); } diff --git a/db/db_impl/db_secondary_test.cc b/db/db_secondary_test.cc similarity index 79% rename from db/db_impl/db_secondary_test.cc rename to db/db_secondary_test.cc index 23dc63aca09..13ec1eec0e0 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -104,7 +104,7 @@ void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir, uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { - log_cnt += (type == kLogFile); + log_cnt += (type == kWalFile); sst_cnt += (type == kTableFile); manifest_cnt += (type == kDescriptorFile); } @@ -147,6 +147,213 @@ TEST_F(DBSecondaryTest, ReopenAsSecondary) { ASSERT_EQ(2, count); } +TEST_F(DBSecondaryTest, SimpleInternalCompaction) { + Options options; + options.env = env_; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + Close(); + + options.max_open_files = -1; + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + + CompactionServiceResult result; + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, + &result)); + + ASSERT_EQ(result.output_files.size(), 1); + InternalKey smallest, largest; + smallest.DecodeFrom(result.output_files[0].smallest_internal_key); + largest.DecodeFrom(result.output_files[0].largest_internal_key); + ASSERT_EQ(smallest.user_key().ToString(), "bar"); + ASSERT_EQ(largest.user_key().ToString(), "foo"); + ASSERT_EQ(result.output_level, 1); + ASSERT_EQ(result.output_path, this->secondary_path_); + ASSERT_EQ(result.num_output_records, 2); + ASSERT_GT(result.bytes_written, 0); + ASSERT_OK(result.status); +} + +TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + const int kRangeL2 = 10; + const int kRangeL1 = 30; + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i * kRangeL2), "value" + ToString(i))); + ASSERT_OK(Put(Key((i + 1) * kRangeL2 - 
1), "value" + ToString(i))); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + for (int i = 0; i < 5; i++) { + ASSERT_OK(Put(Key(i * kRangeL1), "value" + ToString(i))); + ASSERT_OK(Put(Key((i + 1) * kRangeL1 - 1), "value" + ToString(i))); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(1); + for (int i = 0; i < 4; i++) { + ASSERT_OK(Put(Key(i * 30), "value" + ToString(i))); + ASSERT_OK(Put(Key(i * 30 + 50), "value" + ToString(i))); + ASSERT_OK(Flush()); + } + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + + // pick 2 files on level 0 for compaction, which has 3 overlap files on L1 + CompactionServiceInput input1; + input1.input_files.push_back(meta.levels[0].files[2].name); + input1.input_files.push_back(meta.levels[0].files[3].name); + input1.input_files.push_back(meta.levels[1].files[0].name); + input1.input_files.push_back(meta.levels[1].files[1].name); + input1.input_files.push_back(meta.levels[1].files[2].name); + + input1.output_level = 1; + + options.max_open_files = -1; + Close(); + + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + CompactionServiceResult result; + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input1, + &result)); + ASSERT_OK(result.status); + + // pick 2 files on level 1 for compaction, which has 6 overlap files on L2 + CompactionServiceInput input2; + input2.input_files.push_back(meta.levels[1].files[1].name); + input2.input_files.push_back(meta.levels[1].files[2].name); + for (int i = 3; i < 9; i++) { + input2.input_files.push_back(meta.levels[2].files[i].name); + } + + input2.output_level = 2; + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input2, + &result)); + ASSERT_OK(result.status); + + CloseSecondary(); + + // delete all l2 files, without update manifest + for (auto& file : meta.levels[2].files) { + ASSERT_OK(env_->DeleteFile(dbname_ + file.name)); + } + OpenSecondary(options); + cfh = db_secondary_->DefaultColumnFamily(); + Status s = db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input2, + &result); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(result.status); + + // TODO: L0 -> L1 compaction should success, currently version is not built + // if files is missing. 
+ // ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, + // input1, &result)); +} + +TEST_F(DBSecondaryTest, InternalCompactionCompactedFiles) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + + // trigger compaction to delete the files for secondary instance compaction + ASSERT_OK(Put("foo", "foo_value" + std::to_string(3))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(3))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Close(); + + options.max_open_files = -1; + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + + CompactionServiceResult result; + Status s = + db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, &result); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(result.status); +} + +TEST_F(DBSecondaryTest, InternalCompactionMissingFiles) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i < 3; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(Flush()); + } + CompactionServiceInput input; + + ColumnFamilyMetaData meta; + db_->GetColumnFamilyMetaData(&meta); + for (auto& file : meta.levels[0].files) { + ASSERT_EQ(0, meta.levels[0].level); + input.input_files.push_back(file.name); + } + ASSERT_EQ(input.input_files.size(), 3); + + input.output_level = 1; + + Close(); + + ASSERT_OK(env_->DeleteFile(dbname_ + input.input_files[0])); + + options.max_open_files = -1; + OpenSecondary(options); + auto cfh = db_secondary_->DefaultColumnFamily(); + + CompactionServiceResult result; + Status s = + db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, &result); + ASSERT_TRUE(s.IsInvalidArgument()); + ASSERT_OK(result.status); + + input.input_files.erase(input.input_files.begin()); + + ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(cfh, input, + &result)); + ASSERT_OK(result.status); +} + TEST_F(DBSecondaryTest, OpenAsSecondary) { Options options; options.env = env_; @@ -459,20 +666,6 @@ TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { } TEST_F(DBSecondaryTest, MissingTableFile) { - int table_files_not_exist = 0; - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->SetCallBack( - "ReactiveVersionSet::ApplyOneVersionEditToBuilder:AfterLoadTableHandlers", - [&](void* arg) { - Status s = *reinterpret_cast(arg); - if (s.IsPathNotFound()) { - ++table_files_not_exist; - } else if (!s.ok()) { - assert(false); // Should not reach here - } - }); - SyncPoint::GetInstance()->EnableProcessing(); Options options; options.env = env_; options.level0_file_num_compaction_trigger = 4; @@ -499,7 +692,6 @@ TEST_F(DBSecondaryTest, MissingTableFile) { ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value)); ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); - ASSERT_EQ(options.level0_file_num_compaction_trigger, table_files_not_exist); 
ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); ASSERT_EQ("foo_value" + std::to_string(options.level0_file_num_compaction_trigger - 1), @@ -615,10 +807,7 @@ TEST_F(DBSecondaryTest, SwitchManifest) { range_scan_db(); } -// Here, "Snapshot" refers to the version edits written by -// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after -// switching from the old one. -TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { +TEST_F(DBSecondaryTest, SwitchManifestTwice) { Options options; options.env = env_; options.disable_auto_compactions = true; @@ -640,10 +829,15 @@ TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { Reopen(options); ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + Reopen(options); + ASSERT_OK(Put("0", "value1")); ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value1", value); } -TEST_F(DBSecondaryTest, SwitchWAL) { +TEST_F(DBSecondaryTest, DISABLED_SwitchWAL) { const int kNumKeysPerMemtable = 1; Options options; options.env = env_; @@ -692,7 +886,7 @@ TEST_F(DBSecondaryTest, SwitchWAL) { } } -TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { +TEST_F(DBSecondaryTest, DISABLED_SwitchWALMultiColumnFamilies) { const int kNumKeysPerMemtable = 1; SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->LoadDependency( @@ -748,10 +942,12 @@ TEST_F(DBSecondaryTest, SwitchWALMultiColumnFamilies) { } }; for (int k = 0; k != 8; ++k) { - ASSERT_OK( - Put(0 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); - ASSERT_OK( - Put(1 /*cf*/, "key" + std::to_string(k), "value" + std::to_string(k))); + for (int j = 0; j < 2; ++j) { + ASSERT_OK(Put(0 /*cf*/, "key" + std::to_string(k), + "value" + std::to_string(k))); + ASSERT_OK(Put(1 /*cf*/, "key" + std::to_string(k), + "value" + std::to_string(k))); + } TEST_SYNC_POINT( "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"); ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); @@ -883,6 +1079,7 @@ TEST_F(DBSecondaryTest, StartFromInconsistent) { }); SyncPoint::GetInstance()->EnableProcessing(); Options options1; + options1.env = env_; Status s = TryOpenSecondary(options1); ASSERT_TRUE(s.IsCorruption()); } @@ -894,6 +1091,7 @@ TEST_F(DBSecondaryTest, InconsistencyDuringCatchUp) { ASSERT_OK(Flush()); Options options1; + options1.env = env_; OpenSecondary(options1); { diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 2287c2425ae..640b4f5a2cd 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -98,7 +98,7 @@ TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) { for (int i = 0; i < 10; ++i) { GenerateNewFile(&rnd, &key_id, false); } - Flush(); + ASSERT_OK(Flush()); Close(); int const num_files = GetSstFileCount(dbname_); ASSERT_GT(num_files, 0); @@ -141,6 +141,7 @@ TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) { // Just open the DB with the option set to true and check that we don't crash. 
Options options; + options.env = env_; options.skip_checking_sst_file_sizes_on_db_open = true; Reopen(options); @@ -169,7 +170,7 @@ TEST_F(DBSSTTest, DontDeleteMovedFile) { ASSERT_OK(Flush()); } // this should execute both L0->L1 and L1->(move)->L2 compactions - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1", FilesPerLevel(0)); // If the moved file is actually deleted (the move-safeguard in @@ -217,7 +218,7 @@ TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) { ASSERT_OK(Flush()); } // this should execute both L0->L1 and L1->(move)->L2 compactions - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1", FilesPerLevel(0)); test::SleepingBackgroundTask blocking_thread; @@ -263,9 +264,9 @@ TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) { // finish the flush! blocking_thread.WakeUp(); blocking_thread.WaitUntilDone(); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // File just flushed is too big for L0 and L1 so gets moved to L2. - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,1,0,1", FilesPerLevel(0)); metadata.clear(); @@ -301,17 +302,17 @@ TEST_F(DBSSTTest, DBWithSstFileManager) { for (int i = 0; i < 25; i++) { GenerateNewRandomFile(&rnd); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Verify that we are tracking all sst files in dbname_ std::unordered_map<std::string, uint64_t> files_in_db; - ASSERT_OK(GetAllSSTFiles(&files_in_db)); + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); } ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); std::unordered_map<std::string, uint64_t> files_in_db; - ASSERT_OK(GetAllSSTFiles(&files_in_db)); + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); // Verify that we are tracking all sst files in dbname_ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); // Verify the total files size @@ -345,6 +346,262 @@ TEST_F(DBSSTTest, DBWithSstFileManager) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) { + std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get()); + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + int files_scheduled_to_delete = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { + const std::string* const file_path = + static_cast<const std::string*>(arg); + if (file_path->find(".blob") != std::string::npos) { + files_added++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast<const std::string*>(arg); + if (file_path->find(".blob") != std::string::npos) { + files_deleted++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast<const std::string*>(arg); + if (file_path->find(".blob") != std::string::npos) { + ++files_scheduled_to_delete; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + Options
options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.enable_blob_files = true; + options.blob_file_size = 32; // create one blob per file + DestroyAndReopen(options); + Random rnd(301); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("Key_" + std::to_string(i), "Value_" + std::to_string(i))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + std::vector blob_files = GetBlobFileNumbers(); + ASSERT_EQ(files_added, blob_files.size()); + // No blob file is obsoleted. + ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + // No files were moved. + ASSERT_EQ(files_moved, 0); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + + // Verify that we are tracking all sst and blob files in dbname_ + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + // Verify the total files size + uint64_t total_files_size = 0; + for (auto& file_to_size : files_in_db) { + total_files_size += file_to_size.second; + } + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + Close(); + + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Verify that we track all the files again after the DB is closed and opened. + Close(); + + sst_file_manager.reset(NewSstFileManager(env_)); + options.sst_file_manager = sst_file_manager; + sfm = static_cast(sst_file_manager.get()); + + Reopen(options); + + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Destroy DB and it will remove all the blob files from sst file manager and + // blob files deletion will go through ScheduleFileDeletion. 
+ ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_EQ(files_deleted, blob_files.size()); + ASSERT_EQ(files_scheduled_to_delete, blob_files.size()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.enable_blob_files = true; + options.blob_file_size = 32; // create one blob per file + options.disable_auto_compactions = true; + options.enable_blob_garbage_collection = true; + options.blob_garbage_collection_age_cutoff = 0.5; + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + int files_scheduled_to_delete = 0; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_added++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + files_deleted++; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) { + assert(arg); + const std::string* const file_path = + static_cast(arg); + if (file_path->find(".blob") != std::string::npos) { + ++files_scheduled_to_delete; + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + Random rnd(301); + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "first_value"; + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "second_value"; + + ASSERT_OK(Put(first_key, first_value)); + ASSERT_OK(Put(second_key, second_value)); + ASSERT_OK(Flush()); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "third_value"; + constexpr char fourth_key[] = "fourth_key"; + constexpr char fourth_value[] = "fourth_value"; + constexpr char fifth_key[] = "fifth_key"; + constexpr char fifth_value[] = "fifth_value"; + + ASSERT_OK(Put(third_key, third_value)); + ASSERT_OK(Put(fourth_key, fourth_value)); + ASSERT_OK(Put(fifth_key, fifth_value)); + ASSERT_OK(Flush()); + + const std::vector original_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(original_blob_files.size(), 5); + ASSERT_EQ(files_added, 5); + ASSERT_EQ(files_deleted, 0); + ASSERT_EQ(files_scheduled_to_delete, 0); + ASSERT_EQ(files_moved, 0); + { + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + const size_t cutoff_index = static_cast( + options.blob_garbage_collection_age_cutoff * original_blob_files.size()); + + size_t expected_number_of_files = original_blob_files.size(); + // Note: turning off enable_blob_files before the compaction results 
in + // garbage collected values getting inlined. + ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}})); + expected_number_of_files -= cutoff_index; + files_added = 0; + + constexpr Slice* begin = nullptr; + constexpr Slice* end = nullptr; + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end)); + sfm->WaitForEmptyTrash(); + + ASSERT_EQ(Get(first_key), first_value); + ASSERT_EQ(Get(second_key), second_value); + ASSERT_EQ(Get(third_key), third_value); + ASSERT_EQ(Get(fourth_key), fourth_value); + ASSERT_EQ(Get(fifth_key), fifth_value); + + const std::vector new_blob_files = GetBlobFileNumbers(); + + ASSERT_EQ(new_blob_files.size(), expected_number_of_files); + // No new file is added. + ASSERT_EQ(files_added, 0); + ASSERT_EQ(files_deleted, cutoff_index); + ASSERT_EQ(files_scheduled_to_delete, cutoff_index); + ASSERT_EQ(files_moved, 0); + + // Original blob files below the cutoff should be gone, original blob files at + // or above the cutoff should be still there + for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) { + ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]); + } + + { + // Verify that we are tracking all sst and blob files in dbname_ + std::unordered_map files_in_db; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db)); + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db)); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + } + + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); + sfm->WaitForEmptyTrash(); + ASSERT_EQ(files_deleted, 5); + ASSERT_EQ(files_scheduled_to_delete, 5); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + TEST_F(DBSSTTest, RateLimitedDelete) { Destroy(last_options_); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ @@ -373,6 +630,14 @@ TEST_F(DBSSTTest, RateLimitedDelete) { *abs_time_us = Env::Default()->NowMicros(); }); + // Disable PeriodicWorkScheduler as it also has TimedWait, which could update + // the simulated sleep time + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::StartPeriodicWorkScheduler:DisableScheduler", [&](void* arg) { + bool* disable_scheduler = static_cast(arg); + *disable_scheduler = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); @@ -392,7 +657,7 @@ TEST_F(DBSSTTest, RateLimitedDelete) { WriteOptions wo; wo.disableWAL = true; - ASSERT_OK(TryReopen(options)); + Reopen(options); // Create 4 files in L0 for (char v = 'a'; v <= 'd'; v++) { ASSERT_OK(Put("Key2", DummyString(1024, v), wo)); @@ -488,10 +753,11 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) { } class DBWALTestWithParam - : public DBSSTTest, + : public DBTestBase, public testing::WithParamInterface> { public: - DBWALTestWithParam() { + explicit DBWALTestWithParam() + : DBTestBase("/db_wal_test_with_params", /*env_do_fsync=*/true) { wal_dir_ = std::get<0>(GetParam()); wal_dir_same_as_dbname_ = std::get<1>(GetParam()); } @@ -519,7 +785,7 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { bool fake_log_delete; }; - std::unique_ptr env(new MyEnv(Env::Default())); + std::unique_ptr env(new MyEnv(env_)); Destroy(last_options_); env->set_fake_log_delete(true); @@ -539,10 +805,15 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { auto sfm = static_cast(options.sst_file_manager.get()); sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1); - ASSERT_OK(TryReopen(options)); + Reopen(options); // Create 4 files in L0 for (char v = 'a'; v 
<= 'd'; v++) { + if (v == 'd') { + // Maximize the chance that the last log file will be preserved in trash + // before restarting the DB. + options.sst_file_manager->SetDeleteRateBytesPerSecond(1); + } ASSERT_OK(Put("Key2", DummyString(1024, v))); ASSERT_OK(Put("Key3", DummyString(1024, v))); ASSERT_OK(Put("Key4", DummyString(1024, v))); @@ -561,11 +832,11 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { if (!wal_dir_same_as_dbname_) { // Forcibly create some trash log files std::unique_ptr result; - env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, - EnvOptions()); + ASSERT_OK(env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result, + EnvOptions())); result.reset(); } - env->GetChildren(options.wal_dir, &filenames); + ASSERT_OK(env->GetChildren(options.wal_dir, &filenames)); for (const std::string& fname : filenames) { if (fname.find(".log.trash") != std::string::npos) { trash_log_count++; @@ -574,11 +845,11 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) { ASSERT_GE(trash_log_count, 1); env->set_fake_log_delete(false); - ASSERT_OK(TryReopen(options)); + Reopen(options); filenames.clear(); trash_log_count = 0; - env->GetChildren(options.wal_dir, &filenames); + ASSERT_OK(env->GetChildren(options.wal_dir, &filenames)); for (const std::string& fname : filenames) { if (fname.find(".log.trash") != std::string::npos) { trash_log_count++; @@ -602,13 +873,13 @@ TEST_F(DBSSTTest, OpenDBWithExistingTrash) { Destroy(last_options_); // Add some trash files to the db directory so the DB can clean them up - env_->CreateDirIfMissing(dbname_); + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash")); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash")); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash")); // Reopen the DB and verify that it deletes existing trash files - ASSERT_OK(TryReopen(options)); + Reopen(options); sfm->WaitForEmptyTrash(); ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash")); ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash")); @@ -727,7 +998,7 @@ TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) { int num_sst_files = 0; int num_wal_files = 0; std::vector db_files; - env_->GetChildren(dbname_, &db_files); + ASSERT_OK(env_->GetChildren(dbname_, &db_files)); for (std::string f : db_files) { if (f.substr(f.find_last_of(".") + 1) == "sst") { num_sst_files++; @@ -766,7 +1037,7 @@ TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) { uint64_t first_file_size = 0; std::unordered_map files_in_db; - ASSERT_OK(GetAllSSTFiles(&files_in_db, &first_file_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &first_file_size)); ASSERT_EQ(sfm->GetTotalSize(), first_file_size); // Set the maximum allowed space usage to the current total size @@ -777,6 +1048,68 @@ TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) { ASSERT_NOK(Flush()); } +TEST_F(DBSSTTest, DBWithMaxSpaceAllowedWithBlobFiles) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.disable_auto_compactions = true; + options.enable_blob_files = true; + DestroyAndReopen(options); + + Random rnd(301); + + // Generate a file containing keys. 
+ for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), rnd.RandomString(50))); + } + ASSERT_OK(Flush()); + + uint64_t files_size = 0; + uint64_t total_files_size = 0; + std::unordered_map files_in_db; + + ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db, &files_size)); + // Make sure blob files are considered by SstFileManager in size limits. + ASSERT_GT(files_size, 0); + total_files_size = files_size; + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &files_size)); + total_files_size += files_size; + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Set the maximum allowed space usage to the current total size. + sfm->SetMaxAllowedSpaceUsage(files_size + 1); + + bool max_allowed_space_reached = false; + bool delete_blob_file = false; + // Sync point called after blob file is closed and max allowed space is + // checked. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached", + [&](void* /*arg*/) { max_allowed_space_reached = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "BuildTable::AfterDeleteFile", + [&](void* /*arg*/) { delete_blob_file = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + { + "BuildTable::AfterDeleteFile", + "DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1", + }, + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("key1", "val1")); + // This flush will fail + ASSERT_NOK(Flush()); + ASSERT_TRUE(max_allowed_space_reached); + + TEST_SYNC_POINT("DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1"); + ASSERT_TRUE(delete_blob_file); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBSSTTest, CancellingCompactionsWorks) { std::shared_ptr sst_file_manager(NewSstFileManager(env_)); auto sfm = static_cast(sst_file_manager.get()); @@ -807,7 +1140,7 @@ TEST_F(DBSSTTest, CancellingCompactionsWorks) { ASSERT_OK(Flush()); uint64_t total_file_size = 0; std::unordered_map files_in_db; - ASSERT_OK(GetAllSSTFiles(&files_in_db, &total_file_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size)); // Set the maximum allowed space usage to the current total size sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1); @@ -816,7 +1149,7 @@ TEST_F(DBSSTTest, CancellingCompactionsWorks) { ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); // Because we set a callback in CancelledCompaction, we actually // let the compaction run @@ -855,7 +1188,7 @@ TEST_F(DBSSTTest, CancellingManualCompactionsWorks) { ASSERT_OK(Flush()); uint64_t total_file_size = 0; std::unordered_map files_in_db; - ASSERT_OK(GetAllSSTFiles(&files_in_db, &total_file_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size)); // Set the maximum allowed space usage to the current total size sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1); @@ -866,10 +1199,12 @@ TEST_F(DBSSTTest, CancellingManualCompactionsWorks) { ASSERT_OK(Flush()); // OK, now trigger a manual compaction - dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(dbfull() + ->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsCompactionTooLarge()); // Wait for manual compaction to get scheduled and finish - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); // Make sure the stat is bumped @@ 
-879,10 +1214,13 @@ TEST_F(DBSSTTest, CancellingManualCompactionsWorks) { // Now make sure CompactFiles also gets cancelled auto l0_files = collector->GetFlushedFiles(); - dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0); + ASSERT_TRUE( + dbfull() + ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0) + .IsCompactionTooLarge()); // Wait for manual compaction to get scheduled and finish - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount( COMPACTION_CANCELLED), @@ -897,8 +1235,9 @@ TEST_F(DBSSTTest, CancellingManualCompactionsWorks) { "CompactFilesImpl:End", [&](void* /*arg*/) { completed_compactions++; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0); - dbfull()->TEST_WaitForCompact(true); + ASSERT_OK(dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), + l0_files, 0)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); ASSERT_GT(completed_compactions, 0); @@ -966,7 +1305,7 @@ TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) { ASSERT_TRUE(bg_error_set); uint64_t total_sst_files_size = 0; std::unordered_map files_in_db; - ASSERT_OK(GetAllSSTFiles(&files_in_db, &total_sst_files_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_sst_files_size)); ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -1002,7 +1341,7 @@ TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) { CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 2; - db_->CompactRange(compact_options, nullptr, nullptr); + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); // Create 12 Files in L0 for (int i = 0; i < 12; i++) { @@ -1054,7 +1393,7 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) { std::string val = "val_file_" + ToString(i); ASSERT_OK(Put(Key(j), val)); } - Flush(); + ASSERT_OK(Flush()); } ASSERT_EQ("5", FilesPerLevel(0)); @@ -1078,6 +1417,7 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) { // hold current version std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter1->status()); // Compact 5 files into 1 file in L0 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -1101,12 +1441,13 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) { // hold current version std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter2->status()); // Delete all keys and compact, this will delete all live files for (int i = 0; i < 10; i++) { ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("", FilesPerLevel(0)); @@ -1120,6 +1461,7 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) { // Total SST files = 6 (5 original files + compacted file) ASSERT_EQ(total_sst_files_size, 6 * single_file_size); + ASSERT_OK(iter1->status()); iter1.reset(); ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", &total_sst_files_size)); @@ -1127,6 +1469,7 @@ TEST_F(DBSSTTest, GetTotalSstFilesSize) { // Total SST files = 1 (compacted file) ASSERT_EQ(total_sst_files_size, 1 * single_file_size); + ASSERT_OK(iter2->status()); iter2.reset(); ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", &total_sst_files_size)); @@ -1145,7 
+1488,7 @@ TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { // Generate 5 files in L0 for (int i = 0; i < 5; i++) { ASSERT_OK(Put(Key(i), "val")); - Flush(); + ASSERT_OK(Flush()); } ASSERT_EQ("5", FilesPerLevel(0)); @@ -1170,6 +1513,7 @@ TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { // hold current version std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter1->status()); // Compaction will do trivial move from L0 to L1 ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -1193,12 +1537,13 @@ TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { // hold current version std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + ASSERT_OK(iter2->status()); // Delete all keys and compact, this will delete all live files for (int i = 0; i < 5; i++) { ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ("", FilesPerLevel(0)); @@ -1212,7 +1557,9 @@ TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) { // Total SST files = 5 (used in 2 version) ASSERT_EQ(total_sst_files_size, 5 * single_file_size); + ASSERT_OK(iter1->status()); iter1.reset(); + ASSERT_OK(iter2->status()); iter2.reset(); ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size", diff --git a/db/db_statistics_test.cc b/db/db_statistics_test.cc index d4e4c628b1d..0874eb40bbc 100644 --- a/db/db_statistics_test.cc +++ b/db/db_statistics_test.cc @@ -137,7 +137,7 @@ TEST_F(DBStatisticsTest, ResetStats) { ASSERT_EQ(1, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN)); options.statistics->histogramData(DB_WRITE, &histogram_data); ASSERT_GT(histogram_data.max, 0.0); - options.statistics->Reset(); + ASSERT_OK(options.statistics->Reset()); } } } diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index 35ebb6ec3af..96cbe9f1a69 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -11,9 +11,11 @@ #include #include "db/db_test_util.h" +#include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/db.h" #include "rocksdb/utilities/table_properties_collectors.h" +#include "table/format.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/random.h" @@ -50,7 +52,7 @@ class DBTablePropertiesTest : public DBTestBase, public testing::WithParamInterface { public: DBTablePropertiesTest() - : DBTestBase("/db_table_properties_test", /*env_do_fsync=*/true) {} + : DBTestBase("/db_table_properties_test", /*env_do_fsync=*/false) {} TablePropertiesCollection TestGetPropertiesOfTablesInRange( std::vector ranges, std::size_t* num_properties = nullptr, std::size_t* num_files = nullptr); @@ -63,9 +65,9 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) { // Create 4 tables for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); + ASSERT_OK(db_->Put(WriteOptions(), ToString(table * 100 + i), "val")); } - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); } // 1. 
Read table properties directly from file @@ -159,14 +161,14 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { for (int i = 0; i < 10000; i++) { ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); } - Flush(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (NumTableFilesAtLevel(0) == 0) { ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102))); - Flush(); + ASSERT_OK(Flush()); } - db_->PauseBackgroundWork(); + ASSERT_OK(db_->PauseBackgroundWork()); // Ensure that we have at least L0, L1 and L2 ASSERT_GT(NumTableFilesAtLevel(0), 0); @@ -234,8 +236,8 @@ TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) { // Create one table per CF, then verify it was created with the column family // name property. for (uint32_t cf = 0; cf < 2; ++cf) { - Put(cf, "key", "val"); - Flush(cf); + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Flush(cf)); TablePropertiesCollection fname_to_props; ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); @@ -258,22 +260,71 @@ TEST_F(DBTablePropertiesTest, GetDbIdentifiersProperty) { CreateAndReopenWithCF({"goku"}, CurrentOptions()); for (uint32_t cf = 0; cf < 2; ++cf) { - Put(cf, "key", "val"); - Put(cf, "foo", "bar"); - Flush(cf); + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Put(cf, "foo", "bar")); + ASSERT_OK(Flush(cf)); TablePropertiesCollection fname_to_props; ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); ASSERT_EQ(1U, fname_to_props.size()); std::string id, sid; - db_->GetDbIdentity(id); - db_->GetDbSessionId(sid); + ASSERT_OK(db_->GetDbIdentity(id)); + ASSERT_OK(db_->GetDbSessionId(sid)); ASSERT_EQ(id, fname_to_props.begin()->second->db_id); ASSERT_EQ(sid, fname_to_props.begin()->second->db_session_id); } } +class DBTableHostnamePropertyTest + : public DBTestBase, + public ::testing::WithParamInterface> { + public: + DBTableHostnamePropertyTest() + : DBTestBase("/db_table_hostname_property_test", + /*env_do_fsync=*/false) {} +}; + +TEST_P(DBTableHostnamePropertyTest, DbHostLocationProperty) { + option_config_ = std::get<0>(GetParam()); + Options opts = CurrentOptions(); + std::string expected_host_id = std::get<1>(GetParam()); + ; + if (expected_host_id == kHostnameForDbHostId) { + ASSERT_OK(env_->GetHostNameString(&expected_host_id)); + } else { + opts.db_host_id = expected_host_id; + } + CreateAndReopenWithCF({"goku"}, opts); + + for (uint32_t cf = 0; cf < 2; ++cf) { + ASSERT_OK(Put(cf, "key", "val")); + ASSERT_OK(Put(cf, "foo", "bar")); + ASSERT_OK(Flush(cf)); + + TablePropertiesCollection fname_to_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props)); + ASSERT_EQ(1U, fname_to_props.size()); + + ASSERT_EQ(fname_to_props.begin()->second->db_host_id, expected_host_id); + } +} + +INSTANTIATE_TEST_CASE_P( + DBTableHostnamePropertyTest, DBTableHostnamePropertyTest, + ::testing::Values( + // OptionConfig, override db_host_location + std::make_tuple(DBTestBase::OptionConfig::kDefault, + kHostnameForDbHostId), + std::make_tuple(DBTestBase::OptionConfig::kDefault, "foobar"), + std::make_tuple(DBTestBase::OptionConfig::kDefault, ""), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + kHostnameForDbHostId), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + "foobar"), + std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix, + ""))); + class DeletionTriggeredCompactionTestListener : public EventListener { public: void 
OnCompactionBegin(DB* , const CompactionJobInfo& ci) override { @@ -305,8 +356,8 @@ TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { // add an L1 file to prevent tombstones from dropping due to obsolescence // during flush - Put(Key(0), "val"); - Flush(); + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); MoveFilesToLevel(1); DeletionTriggeredCompactionTestListener *listener = @@ -317,14 +368,14 @@ TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); // Change the window size and deletion trigger and ensure new values take @@ -338,14 +389,14 @@ TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); // Change the window size to disable delete triggered compaction @@ -357,14 +408,14 @@ TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) { for (int i = 0; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && i < kNumKeys - kWindowSize + kNumDelsTrigger) { - Delete(Key(i)); + ASSERT_OK(Delete(Key(i))); } else { - Put(Key(i), "val"); + ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, NumTableFilesAtLevel(0)); ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_WRITE_BYTES_MARKED)); ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_READ_BYTES_MARKED)); @@ -387,8 +438,8 @@ TEST_P(DBTablePropertiesTest, RatioBasedDeletionTriggeredCompactionMarking) { // Add an L2 file to prevent tombstones from dropping due to obsolescence // during flush - Put(Key(0), "val"); - Flush(); + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); MoveFilesToLevel(2); auto* listener = new DeletionTriggeredCompactionTestListener(); diff --git a/db/db_tailing_iter_test.cc b/db/db_tailing_iter_test.cc index f33b7cb13a5..d77168d9648 100644 --- a/db/db_tailing_iter_test.cc +++ b/db/db_tailing_iter_test.cc @@ -31,6 +31,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorSingle) { std::unique_ptr iter(db_->NewIterator(read_options)); iter->SeekToFirst(); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); // add a record and check that iter can see it ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor")); @@ -48,6 +49,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorKeepAdding) { read_options.tailing = true; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); std::string value(1024, 'a'); const int num_records = 10000; @@ -70,7 +72,9 @@ TEST_F(DBTestTailingIterator, TailingIteratorSeekToNext) { read_options.tailing = true; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); std::unique_ptr itern(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(itern->status()); std::string value(1024, 
'a'); const int num_records = 1000; @@ -138,8 +142,11 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { Slice keyu(bufe, 20); read_options.iterate_upper_bound = &keyu; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); std::unique_ptr itern(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(itern->status()); std::unique_ptr iterh(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iterh->status()); std::string value(1024, 'a'); bool file_iters_deleted = false; bool file_iters_renewed_null = false; @@ -179,7 +186,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { if (i % 100 == 99) { ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (i == 299) { file_iters_deleted = true; } @@ -225,6 +232,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { ReopenWithColumnFamilies({"default", "pikachu"}, options); read_options.read_tier = kBlockCacheTier; std::unique_ptr iteri(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iteri->status()); char buf5[32]; snprintf(buf5, sizeof(buf5), "00a0%016d", (num_records / 2) * 5 - 2); Slice target1(buf5, 20); @@ -236,6 +244,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { options.table_factory.reset(NewBlockBasedTableFactory()); ReopenWithColumnFamilies({"default", "pikachu"}, options); iter.reset(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); for (int i = 2 * num_records; i > 0; --i) { char buf1[32]; char buf2[32]; @@ -262,6 +271,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorDeletes) { read_options.tailing = true; std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); // write a single record, read it using the iterator, then delete it ASSERT_OK(Put(1, "0test", "test")); @@ -309,6 +319,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorPrefixSeek) { CreateAndReopenWithCF({"pikachu"}, options); std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); ASSERT_OK(Put(1, "0101", "test")); ASSERT_OK(Flush(1)); @@ -339,6 +350,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorIncomplete) { ASSERT_OK(db_->Put(WriteOptions(), key, value)); std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); iter->SeekToFirst(); // we either see the entry or it's not in cache ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete()); @@ -369,6 +381,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorSeekToSame) { } std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); // Seek to 00001. We expect to find 00002. std::string start_key = "00001"; iter->Seek(start_key); @@ -404,6 +417,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorUpperBound) { ASSERT_OK(Put(1, "21", "21")); std::unique_ptr it(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(it->status()); it->Seek("12"); ASSERT_TRUE(it->Valid()); ASSERT_EQ("12", it->key().ToString()); @@ -411,7 +425,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorUpperBound) { it->Next(); // Not valid since "21" is over the upper bound. ASSERT_FALSE(it->Valid()); - + ASSERT_OK(it->status()); // This keeps track of the number of times NeedToSeekImmutable() was true. 
int immutable_seeks = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -424,6 +438,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorUpperBound) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); ASSERT_EQ(0, immutable_seeks); } @@ -478,6 +493,8 @@ TEST_F(DBTestTailingIterator, TailingIteratorGap) { it->Next(); ASSERT_TRUE(it->Valid()); ASSERT_EQ("40", it->key().ToString()); + + ASSERT_OK(it->status()); } TEST_F(DBTestTailingIterator, SeekWithUpperBoundBug) { @@ -496,6 +513,7 @@ TEST_F(DBTestTailingIterator, SeekWithUpperBoundBug) { ASSERT_OK(Flush()); std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); iter->Seek("aa"); ASSERT_TRUE(iter->Valid()); @@ -518,6 +536,7 @@ TEST_F(DBTestTailingIterator, SeekToFirstWithUpperBoundBug) { ASSERT_OK(Flush()); std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); diff --git a/db/db_test.cc b/db/db_test.cc index 948036cfce2..282631414a8 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -247,17 +247,21 @@ TEST_F(DBTest, SkipDelay) { wo.sync = sync; wo.disableWAL = disableWAL; wo.no_slowdown = true; - dbfull()->Put(wo, "foo", "bar"); + // Large enough to exceed allowance for one time interval + std::string large_value(1024, 'x'); + // Perhaps ideally this first write would fail because of delay, but + // the current implementation does not guarantee that. + dbfull()->Put(wo, "foo", large_value).PermitUncheckedError(); // We need the 2nd write to trigger delay. This is because delay is // estimated based on the last write size which is 0 for the first write. - ASSERT_NOK(dbfull()->Put(wo, "foo2", "bar2")); + ASSERT_NOK(dbfull()->Put(wo, "foo2", large_value)); ASSERT_GE(sleep_count.load(), 0); ASSERT_GE(wait_count.load(), 0); token.reset(); - token = dbfull()->TEST_write_controler().GetDelayToken(1000000000); + token = dbfull()->TEST_write_controler().GetDelayToken(1000000); wo.no_slowdown = false; - ASSERT_OK(dbfull()->Put(wo, "foo3", "bar3")); + ASSERT_OK(dbfull()->Put(wo, "foo3", large_value)); ASSERT_GE(sleep_count.load(), 1); token.reset(); } @@ -906,6 +910,9 @@ TEST_F(DBTest, FlushSchedule) { static_cast(options.write_buffer_size); options.max_write_buffer_number = 2; options.write_buffer_size = 120 * 1024; + auto flush_listener = std::make_shared(); + flush_listener->expected_flush_reason = FlushReason::kWriteBufferFull; + options.listeners.push_back(flush_listener); CreateAndReopenWithCF({"pikachu"}, options); std::vector threads; @@ -1307,51 +1314,6 @@ TEST_F(DBTest, DISABLED_RepeatedWritesToSameKey) { } #endif // ROCKSDB_LITE -TEST_F(DBTest, SparseMerge) { - do { - Options options = CurrentOptions(); - options.compression = kNoCompression; - CreateAndReopenWithCF({"pikachu"}, options); - - FillLevels("A", "Z", 1); - - // Suppose there is: - // small amount of data with prefix A - // large amount of data with prefix B - // small amount of data with prefix C - // and that recent updates have made small changes to all three prefixes. - // Check that we do not do a compaction that merges all of B in one shot. 
- const std::string value(1000, 'x'); - Put(1, "A", "va"); - // Write approximately 100MB of "B" values - for (int i = 0; i < 100000; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); - } - Put(1, "C", "vc"); - ASSERT_OK(Flush(1)); - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - - // Make sparse update - Put(1, "A", "va2"); - Put(1, "B100", "bvalue2"); - Put(1, "C", "vc2"); - ASSERT_OK(Flush(1)); - - // Compactions should not cause us to create a situation where - // a file overlaps too much data at the next level. - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - dbfull()->TEST_CompactRange(1, nullptr, nullptr); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]), - 20 * 1048576); - } while (ChangeCompactOptions()); -} - #ifndef ROCKSDB_LITE static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); @@ -1384,17 +1346,19 @@ TEST_F(DBTest, ApproximateSizesMemTable) { SizeApproximationOptions size_approx_options; size_approx_options.include_memtabtles = true; size_approx_options.include_files = true; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_GT(size, 6000); ASSERT_LT(size, 204800); // Zero if not including mem table - db_->GetApproximateSizes(&r, 1, &size); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); ASSERT_EQ(size, 0); start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); for (int i = 0; i < N; i++) { @@ -1404,13 +1368,15 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(500); end = Key(600); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); start = Key(100); end = Key(1020); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_GT(size, 6000); options.max_write_buffer_number = 8; @@ -1436,29 +1402,32 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(100); end = Key(300); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_GT(size, 6000); start = Key(2100); end = Key(2300); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, 0); start = Key(1050); end = Key(1080); r = Range(start, end); uint64_t size_with_mt, size_without_mt; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, - &size_with_mt); + 
ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt)); ASSERT_GT(size_with_mt, 6000); - db_->GetApproximateSizes(&r, 1, &size_without_mt); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt)); ASSERT_EQ(size_without_mt, 0); Flush(); @@ -1470,15 +1439,16 @@ TEST_F(DBTest, ApproximateSizesMemTable) { start = Key(1050); end = Key(1080); r = Range(start, end); - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, - &size_with_mt); - db_->GetApproximateSizes(&r, 1, &size_without_mt); + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size_with_mt)); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt)); ASSERT_GT(size_with_mt, size_without_mt); ASSERT_GT(size_without_mt, 6000); // Check that include_memtabtles flag works as expected size_approx_options.include_memtabtles = false; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_EQ(size, size_without_mt); // Check that files_size_error_margin works as expected, when the heuristic @@ -1487,10 +1457,12 @@ TEST_F(DBTest, ApproximateSizesMemTable) { end = Key(1000 + N - 2); r = Range(start, end); size_approx_options.files_size_error_margin = -1.0; // disabled - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); uint64_t size2; size_approx_options.files_size_error_margin = 0.5; // enabled, but not used - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2)); ASSERT_EQ(size, size2); } @@ -1541,14 +1513,16 @@ TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) { // Get the precise size without any approximation heuristic uint64_t size; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size); + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size)); ASSERT_NE(size, 0); // Get the size with an approximation heuristic uint64_t size2; const double error_margin = 0.2; size_approx_options.files_size_error_margin = error_margin; - db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2); + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, + &size2)); ASSERT_LT(size2, size * (1 + error_margin)); ASSERT_GT(size2, size * (1 - error_margin)); } @@ -1564,7 +1538,7 @@ TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) { const std::string end = Key(i + 11); // overlap by 1 key const Range r(start, end); uint64_t size; - db_->GetApproximateSizes(&r, 1, &size); + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); ASSERT_LE(size, 11 * 100); } } @@ -1632,9 +1606,12 @@ TEST_F(DBTest, ApproximateSizes) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + uint64_t size; + ASSERT_OK(Size("", "xyz", 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); + ASSERT_OK(Size("", "xyz", 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); // Write 8MB (80 values, each 100K) ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); @@ -1647,7 +1624,8 @@ TEST_F(DBTest, ApproximateSizes) { } // 0 because GetApproximateSizes() does not account for memtable space - ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 
0)); + ASSERT_OK(Size("", Key(50), 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { @@ -1655,14 +1633,17 @@ TEST_F(DBTest, ApproximateSizes) { for (int compact_start = 0; compact_start < N; compact_start += 10) { for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i)); - ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1), - S2 * (i + 1))); - ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10)); + ASSERT_OK(Size("", Key(i), 1, &size)); + ASSERT_TRUE(Between(size, S1 * i, S2 * i)); + ASSERT_OK(Size("", Key(i) + ".suffix", 1, &size)); + ASSERT_TRUE(Between(size, S1 * (i + 1), S2 * (i + 1))); + ASSERT_OK(Size(Key(i), Key(i + 10), 1, &size)); + ASSERT_TRUE(Between(size, S1 * 10, S2 * 10)); } - ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50)); - ASSERT_TRUE( - Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50)); + ASSERT_OK(Size("", Key(50), 1, &size)); + ASSERT_TRUE(Between(size, S1 * 50, S2 * 50)); + ASSERT_OK(Size("", Key(50) + ".suffix", 1, &size)); + ASSERT_TRUE(Between(size, S1 * 50, S2 * 50)); std::string cstart_str = Key(compact_start); std::string cend_str = Key(compact_start + 9); @@ -1697,21 +1678,32 @@ TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) { ASSERT_OK(Put(1, Key(7), rnd.RandomString(10000))); // Check sizes across recovery by reopening a few times + uint64_t size; for (int run = 0; run < 3; run++) { ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0)); - ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000)); - ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000)); - ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000)); - ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000)); - ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 232000)); - ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 242000)); + ASSERT_OK(Size("", Key(0), 1, &size)); + ASSERT_TRUE(Between(size, 0, 0)); + ASSERT_OK(Size("", Key(1), 1, &size)); + ASSERT_TRUE(Between(size, 10000, 11000)); + ASSERT_OK(Size("", Key(2), 1, &size)); + ASSERT_TRUE(Between(size, 20000, 21000)); + ASSERT_OK(Size("", Key(3), 1, &size)); + ASSERT_TRUE(Between(size, 120000, 121000)); + ASSERT_OK(Size("", Key(4), 1, &size)); + ASSERT_TRUE(Between(size, 130000, 131000)); + ASSERT_OK(Size("", Key(5), 1, &size)); + ASSERT_TRUE(Between(size, 230000, 232000)); + ASSERT_OK(Size("", Key(6), 1, &size)); + ASSERT_TRUE(Between(size, 240000, 242000)); // Ensure some overhead is accounted for, even without including all - ASSERT_TRUE(Between(Size("", Key(7), 1), 540500, 545000)); - ASSERT_TRUE(Between(Size("", Key(8), 1), 550500, 555000)); + ASSERT_OK(Size("", Key(7), 1, &size)); + ASSERT_TRUE(Between(size, 540500, 545000)); + ASSERT_OK(Size("", Key(8), 1, &size)); + ASSERT_TRUE(Between(size, 550500, 555000)); - ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110100, 111000)); + ASSERT_OK(Size(Key(3), Key(5), 1, &size)); + ASSERT_TRUE(Between(size, 110100, 111000)); dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); } @@ -1795,6 +1787,7 @@ TEST_F(DBTest, Snapshot) { TEST_F(DBTest, HiddenValuesAreRemoved) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; + uint64_t size; do { Options options = CurrentOptions(options_override); CreateAndReopenWithCF({"pikachu"}, options); @@ -1812,7 +1805,8 @@ TEST_F(DBTest, HiddenValuesAreRemoved) { 
ASSERT_GT(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(big, Get(1, "foo", snapshot)); - ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000)); + ASSERT_OK(Size("", "pastfoo", 1, &size)); + ASSERT_TRUE(Between(size, 50000, 60000)); db_->ReleaseSnapshot(snapshot); ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]"); Slice x("x"); @@ -1823,7 +1817,8 @@ TEST_F(DBTest, HiddenValuesAreRemoved) { dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]); ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]"); - ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000)); + ASSERT_OK(Size("", "pastfoo", 1, &size)); + ASSERT_TRUE(Between(size, 0, 1000)); // ApproximateOffsetOf() is not yet implemented in plain table format, // which is used by Size(). } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | @@ -2346,6 +2341,13 @@ TEST_F(DBTest, ReadonlyDBGetLiveManifestSize) { } TEST_F(DBTest, GetLiveBlobFiles) { + // Note: the following prevents an otherwise harmless data race between the + // test setup code (AddBlobFile) below and the periodic stat dumping thread. + Options options = CurrentOptions(); + options.stats_dump_period_sec = 0; + + Reopen(options); + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); assert(versions); assert(versions->GetColumnFamilySet()); @@ -2393,6 +2395,7 @@ TEST_F(DBTest, PurgeInfoLogs) { Options options = CurrentOptions(); options.keep_log_file_num = 5; options.create_if_missing = true; + options.env = env_; for (int mode = 0; mode <= 1; mode++) { if (mode == 1) { options.db_log_dir = dbname_ + "_logs"; @@ -3506,17 +3509,21 @@ TEST_F(DBTest, FIFOCompactionStyleWithCompactionAndDelete) { } // Check that FIFO-with-TTL is not supported with max_open_files != -1. +// Github issue #8014 TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) { - Options options; + Options options = CurrentOptions(); options.compaction_style = kCompactionStyleFIFO; options.create_if_missing = true; options.ttl = 600; // seconds - // TTL is now supported with max_open_files != -1. + // TTL is not supported with max_open_files != -1. 
+ options.max_open_files = 0; + ASSERT_TRUE(TryReopen(options).IsNotSupported()); + options.max_open_files = 100; - options = CurrentOptions(options); - ASSERT_OK(TryReopen(options)); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); + // TTL is supported with unlimited max_open_files options.max_open_files = -1; ASSERT_OK(TryReopen(options)); } @@ -3951,6 +3958,7 @@ TEST_F(DBTest, WriteSingleThreadEntry) { TEST_F(DBTest, ConcurrentFlushWAL) { const size_t cnt = 100; Options options; + options.env = env_; WriteOptions wopt; ReadOptions ropt; for (bool two_write_queues : {false, true}) { @@ -4619,6 +4627,7 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) { Random rnd(301); Options options; + options.env = env_; options.create_if_missing = true; options.db_write_buffer_size = 20480; options.write_buffer_size = 20480; @@ -5021,6 +5030,7 @@ TEST_F(DBTest, DynamicFIFOCompactionOptions) { Options options; options.ttl = 0; options.create_if_missing = true; + options.env = env_; DestroyAndReopen(options); // Initial defaults @@ -5082,6 +5092,7 @@ TEST_F(DBTest, DynamicFIFOCompactionOptions) { TEST_F(DBTest, DynamicUniversalCompactionOptions) { Options options; options.create_if_missing = true; + options.env = env_; DestroyAndReopen(options); // Initial defaults @@ -5307,41 +5318,45 @@ TEST_F(DBTest, DynamicMiscOptions) { #endif // ROCKSDB_LITE TEST_F(DBTest, L0L1L2AndUpHitCounter) { + const int kNumLevels = 3; + const int kNumKeysPerLevel = 10000; + const int kNumKeysPerDb = kNumLevels * kNumKeysPerLevel; + Options options = CurrentOptions(); - options.write_buffer_size = 32 * 1024; - options.target_file_size_base = 32 * 1024; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; - options.max_bytes_for_level_base = 64 * 1024; - options.max_write_buffer_number = 2; - options.max_background_compactions = 8; - options.max_background_flushes = 8; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - CreateAndReopenWithCF({"mypikachu"}, options); + Reopen(options); - int numkeys = 20000; - for (int i = 0; i < numkeys; i++) { - ASSERT_OK(Put(1, Key(i), "val")); + // After the below loop there will be one file on each of L0, L1, and L2. + int key = 0; + for (int output_level = kNumLevels - 1; output_level >= 0; --output_level) { + for (int i = 0; i < kNumKeysPerLevel; ++i) { + ASSERT_OK(Put(Key(key), "val")); + key++; + } + ASSERT_OK(Flush()); + for (int input_level = 0; input_level < output_level; ++input_level) { + // `TEST_CompactRange(input_level, ...)` compacts from `input_level` to + // `input_level + 1`. 
+ ASSERT_OK(dbfull()->TEST_CompactRange(input_level, nullptr, nullptr)); + } } + assert(key == kNumKeysPerDb); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); - ASSERT_OK(Flush(1)); - dbfull()->TEST_WaitForCompact(); - - for (int i = 0; i < numkeys; i++) { - ASSERT_EQ(Get(1, Key(i)), "val"); + for (int i = 0; i < kNumKeysPerDb; i++) { + ASSERT_EQ(Get(Key(i)), "val"); } - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100); - ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100); + ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); - ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) + - TestGetTickerCount(options, GET_HIT_L1) + - TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + ASSERT_EQ(kNumKeysPerDb, TestGetTickerCount(options, GET_HIT_L0) + + TestGetTickerCount(options, GET_HIT_L1) + + TestGetTickerCount(options, GET_HIT_L2_AND_UP)); } TEST_F(DBTest, EncodeDecompressedBlockSizeTest) { @@ -6697,20 +6712,19 @@ TEST_F(DBTest, MemoryUsageWithMaxWriteBufferSizeToMaintain) { Reopen(options); Random rnd(301); bool memory_limit_exceeded = false; - uint64_t size_all_mem_table = 0; - uint64_t cur_active_mem = 0; + + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + for (int i = 0; i < 1000; i++) { std::string value = rnd.RandomString(1000); ASSERT_OK(Put("keykey_" + std::to_string(i), value)); dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_TRUE(db_->GetIntProperty(db_->DefaultColumnFamily(), - DB::Properties::kSizeAllMemTables, - &size_all_mem_table)); - ASSERT_TRUE(db_->GetIntProperty(db_->DefaultColumnFamily(), - DB::Properties::kCurSizeActiveMemTable, - &cur_active_mem)); + const uint64_t cur_active_mem = cfd->mem()->ApproximateMemoryUsage(); + const uint64_t size_all_mem_table = + cur_active_mem + cfd->imm()->ApproximateMemoryUsage(); // Errors out if memory usage keeps on increasing beyond the limit. // Once memory limit exceeds, memory_limit_exceeded is set and if diff --git a/db/db_test2.cc b/db/db_test2.cc index ed0f469d0a6..f209cdd0384 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -9,9 +9,11 @@ #include #include #include +#include #include "db/db_test_util.h" #include "db/read_callback.h" +#include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/persistent_cache.h" @@ -40,9 +42,7 @@ TEST_F(DBTest2, OpenForReadOnly) { std::vector files; ASSERT_OK(env_->GetChildren(dbname, &files)); for (auto& f : files) { - if (f != "." && f != "..") { - ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); - } + ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); } // should be empty now and we should be able to delete it ASSERT_OK(env_->DeleteDir(dbname)); @@ -74,9 +74,7 @@ TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) { std::vector files; ASSERT_OK(env_->GetChildren(dbname, &files)); for (auto& f : files) { - if (f != "." 
&& f != "..") { - ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); - } + ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); } // should be empty now and we should be able to delete it ASSERT_OK(env_->DeleteDir(dbname)); @@ -104,6 +102,7 @@ class TestReadOnlyWithCompressedCache TEST_P(TestReadOnlyWithCompressedCache, ReadOnlyWithCompressedCache) { if (use_mmap_ && !IsMemoryMappedAccessSupported()) { + ROCKSDB_GTEST_SKIP("Test requires MMAP support"); return; } ASSERT_OK(Put("foo", "bar")); @@ -156,8 +155,14 @@ class PartitionedIndexTestListener : public EventListener { }; TEST_F(DBTest2, PartitionedIndexUserToInternalKey) { + const int kValueSize = 10500; + const int kNumEntriesPerFile = 1000; + const int kNumFiles = 3; + const int kNumDistinctKeys = 30; + BlockBasedTableOptions table_options; Options options = CurrentOptions(); + options.disable_auto_compactions = true; table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; PartitionedIndexTestListener* listener = new PartitionedIndexTestListener(); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -166,13 +171,16 @@ TEST_F(DBTest2, PartitionedIndexUserToInternalKey) { Reopen(options); Random rnd(301); - for (int i = 0; i < 3000; i++) { - int j = i % 30; - std::string value = rnd.RandomString(10500); - ASSERT_OK(Put("keykey_" + std::to_string(j), value)); - snapshots.push_back(db_->GetSnapshot()); + for (int i = 0; i < kNumFiles; i++) { + for (int j = 0; j < kNumEntriesPerFile; j++) { + int key_id = (i * kNumEntriesPerFile + j) % kNumDistinctKeys; + std::string value = rnd.RandomString(kValueSize); + ASSERT_OK(Put("keykey_" + std::to_string(key_id), value)); + snapshots.push_back(db_->GetSnapshot()); + } + ASSERT_OK(Flush()); } - Flush(); + for (auto s : snapshots) { db_->ReleaseSnapshot(s); } @@ -291,7 +299,7 @@ TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) { BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); Put(1, "a", "begin"); @@ -337,6 +345,10 @@ class DBTestSharedWriteBufferAcrossCFs TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { Options options = CurrentOptions(); options.arena_block_size = 4096; + auto flush_listener = std::make_shared(); + options.listeners.push_back(flush_listener); + // Don't trip the listener at shutdown. + options.avoid_flush_during_shutdown = true; // Avoid undeterministic value by malloc_usable_size(); // Force arena block size to 1 @@ -380,6 +392,7 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { // Create some data and flush "default" and "nikitich" so that they // are newer CFs created. 
+ flush_listener->expected_flush_reason = FlushReason::kManualFlush; ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); Flush(3); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); @@ -390,6 +403,7 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), static_cast(1)); + flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); if (cost_cache_) { ASSERT_GE(cache->GetUsage(), 256 * 1024); @@ -514,6 +528,10 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2"); Options options = CurrentOptions(); options.arena_block_size = 4096; + auto flush_listener = std::make_shared(); + options.listeners.push_back(flush_listener); + // Don't trip the listener at shutdown. + options.avoid_flush_during_shutdown = true; // Avoid undeterministic value by malloc_usable_size(); // Force arena block size to 1 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -551,6 +569,7 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { }; // Trigger a flush on cf2 + flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; ASSERT_OK(Put(2, Key(1), DummyString(70000), wo)); wait_flush(); ASSERT_OK(Put(0, Key(1), DummyString(20000), wo)); @@ -1344,7 +1363,7 @@ TEST_F(DBTest2, PresetCompressionDictLocality) { options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry; BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); Random rnd(301); @@ -1389,6 +1408,236 @@ TEST_F(DBTest2, PresetCompressionDictLocality) { } } +class PresetCompressionDictTest + : public DBTestBase, + public testing::WithParamInterface> { + public: + PresetCompressionDictTest() + : DBTestBase("/db_test2", false /* env_do_fsync */), + compression_type_(std::get<0>(GetParam())), + bottommost_(std::get<1>(GetParam())) {} + + protected: + const CompressionType compression_type_; + const bool bottommost_; +}; + +INSTANTIATE_TEST_CASE_P( + DBTest2, PresetCompressionDictTest, + ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()), + ::testing::Bool())); + +TEST_P(PresetCompressionDictTest, Flush) { + // Verifies that dictionary is generated and written during flush only when + // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the + // size of the dictionary is within expectations according to the limit on + // buffering set by `CompressionOptions::max_dict_buffer_bytes`. 
+ const size_t kValueLen = 256; + const size_t kKeysPerFile = 1 << 10; + const size_t kDictLen = 16 << 10; + const size_t kBlockLen = 4 << 10; + + Options options = CurrentOptions(); + if (bottommost_) { + options.bottommost_compression = compression_type_; + options.bottommost_compression_opts.enabled = true; + options.bottommost_compression_opts.max_dict_bytes = kDictLen; + options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; + } else { + options.compression = compression_type_; + options.compression_opts.max_dict_bytes = kDictLen; + options.compression_opts.max_dict_buffer_bytes = kBlockLen; + } + options.memtable_factory.reset(new SpecialSkipListFactory(kKeysPerFile)); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.block_size = kBlockLen; + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + for (size_t i = 0; i <= kKeysPerFile; ++i) { + ASSERT_OK(Put(Key(static_cast(i)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + + // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a + // compression dictionary exists since dictionaries would be preloaded when + // the flush finishes. + if (bottommost_) { + // Flush is never considered bottommost. This should change in the future + // since flushed files may have nothing underneath them, like the one in + // this test case. + ASSERT_EQ( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 0); + } else { + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 0); + // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on + // number of bytes needs to be adjusted in case the cached block is in + // ZSTD's digested dictionary format. + if (compression_type_ != kZSTD && + compression_type_ != kZSTDNotFinalCompression) { + // Although we limited buffering to `kBlockLen`, there may be up to two + // blocks of data included in the dictionary since we only check limit + // after each block is built. + ASSERT_LE(TestGetTickerCount(options, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 2 * kBlockLen); + } + } +} + +TEST_P(PresetCompressionDictTest, CompactNonBottommost) { + // Verifies that dictionary is generated and written during compaction to + // non-bottommost level only when `ColumnFamilyOptions::compression` enables + // dictionary. Also verifies the size of the dictionary is within expectations + // according to the limit on buffering set by + // `CompressionOptions::max_dict_buffer_bytes`. 
+ const size_t kValueLen = 256; + const size_t kKeysPerFile = 1 << 10; + const size_t kDictLen = 16 << 10; + const size_t kBlockLen = 4 << 10; + + Options options = CurrentOptions(); + if (bottommost_) { + options.bottommost_compression = compression_type_; + options.bottommost_compression_opts.enabled = true; + options.bottommost_compression_opts.max_dict_bytes = kDictLen; + options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; + } else { + options.compression = compression_type_; + options.compression_opts.max_dict_bytes = kDictLen; + options.compression_opts.max_dict_buffer_bytes = kBlockLen; + } + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.block_size = kBlockLen; + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + for (size_t j = 0; j <= kKeysPerFile; ++j) { + ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + + for (int i = 0; i < 2; ++i) { + for (size_t j = 0; j <= kKeysPerFile; ++j) { + ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(Flush()); + } +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,0,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + + uint64_t prev_compression_dict_bytes_inserted = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); + // This L0->L1 compaction merges the two L0 files into L1. The produced L1 + // file is not bottommost due to the existing L2 file covering the same key- + // range. + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a + // compression dictionary exists since dictionaries would be preloaded when + // the compaction finishes. + if (bottommost_) { + ASSERT_EQ( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted); + } else { + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted); + // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on + // number of bytes needs to be adjusted in case the cached block is in + // ZSTD's digested dictionary format. + if (compression_type_ != kZSTD && + compression_type_ != kZSTDNotFinalCompression) { + // Although we limited buffering to `kBlockLen`, there may be up to two + // blocks of data included in the dictionary since we only check limit + // after each block is built. + ASSERT_LE(TestGetTickerCount(options, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted + 2 * kBlockLen); + } + } +} + +TEST_P(PresetCompressionDictTest, CompactBottommost) { + // Verifies that dictionary is generated and written during compaction to + // non-bottommost level only when either `ColumnFamilyOptions::compression` or + // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also + // verifies the size of the dictionary is within expectations according to the + // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`. 
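+  // Unlike the non-bottommost case, the full-range CompactRange() below
+  // leaves nothing underneath its output, so the result is bottommost and a
+  // dictionary is expected regardless of whether it was enabled through
+  // `compression` or `bottommost_compression`.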
+ const size_t kValueLen = 256; + const size_t kKeysPerFile = 1 << 10; + const size_t kDictLen = 16 << 10; + const size_t kBlockLen = 4 << 10; + + Options options = CurrentOptions(); + if (bottommost_) { + options.bottommost_compression = compression_type_; + options.bottommost_compression_opts.enabled = true; + options.bottommost_compression_opts.max_dict_bytes = kDictLen; + options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; + } else { + options.compression = compression_type_; + options.compression_opts.max_dict_bytes = kDictLen; + options.compression_opts.max_dict_buffer_bytes = kBlockLen; + } + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.block_size = kBlockLen; + bbto.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + for (size_t j = 0; j <= kKeysPerFile; ++j) { + ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); + } + ASSERT_OK(Flush()); + } +#ifndef ROCKSDB_LITE + ASSERT_EQ("2", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + + uint64_t prev_compression_dict_bytes_inserted = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); + CompactRangeOptions cro; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,1", FilesPerLevel(0)); +#endif // ROCKSDB_LITE + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted); + // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on + // number of bytes needs to be adjusted in case the cached block is in ZSTD's + // digested dictionary format. + if (compression_type_ != kZSTD && + compression_type_ != kZSTDNotFinalCompression) { + // Although we limited buffering to `kBlockLen`, there may be up to two + // blocks of data included in the dictionary since we only check limit after + // each block is built. 
+ ASSERT_LE( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + prev_compression_dict_bytes_inserted + 2 * kBlockLen); + } +} + class CompactionCompressionListener : public EventListener { public: explicit CompactionCompressionListener(Options* db_options) @@ -1470,7 +1719,7 @@ TEST_P(CompressionFailuresTest, CompressionFailures) { BlockBasedTableOptions table_options; table_options.block_size = 512; table_options.verify_compression = true; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.compression = compression_type_; options.compression_opts.parallel_threads = compression_parallel_threads_; @@ -1808,7 +2057,7 @@ class PinL0IndexAndFilterBlocksTest table_options.cache_index_and_filter_blocks = true; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options->table_factory.reset(new BlockBasedTableFactory(table_options)); + options->table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, *options); Put(1, "a", "begin"); @@ -1848,7 +2097,7 @@ TEST_P(PinL0IndexAndFilterBlocksTest, table_options.cache_index_and_filter_blocks = true; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "key", "val")); @@ -2485,26 +2734,30 @@ TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) { { const int kIdBufLen = 100; char id_buf[kIdBufLen]; + Status s = Status::NotSupported(); #ifndef OS_WIN // You can't open a directory on windows using random access file std::unique_ptr file; - ASSERT_OK(env_->NewRandomAccessFile(dbname_, &file, EnvOptions())); - if (file->GetUniqueId(id_buf, kIdBufLen) == 0) { - // fs holding db directory doesn't support getting a unique file id, - // this means that running this test will fail because lru_cache will load - // the blocks again regardless of them being already in the cache - return; - } -#else - std::unique_ptr dir; - ASSERT_OK(env_->NewDirectory(dbname_, &dir)); - if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) { - // fs holding db directory doesn't support getting a unique file id, - // this means that running this test will fail because lru_cache will load - // the blocks again regardless of them being already in the cache - return; + s = env_->NewRandomAccessFile(dbname_, &file, EnvOptions()); + if (s.ok()) { + if (file->GetUniqueId(id_buf, kIdBufLen) == 0) { + // fs holding db directory doesn't support getting a unique file id, + // this means that running this test will fail because lru_cache will + // load the blocks again regardless of them being already in the cache + return; + } } #endif + if (!s.ok()) { + std::unique_ptr dir; + ASSERT_OK(env_->NewDirectory(dbname_, &dir)); + if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) { + // fs holding db directory doesn't support getting a unique file id, + // this means that running this test will fail because lru_cache will + // load the blocks again regardless of them being already in the cache + return; + } + } } uint32_t bytes_per_bit[2] = {1, 16}; for (size_t k = 0; k < 2; k++) { @@ -2949,6 +3202,180 @@ TEST_F(DBTest2, PausingManualCompaction4) { 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBTest2, CancelManualCompaction1) { + CompactRangeOptions compact_options; + auto canceledPtr = + std::unique_ptr>(new std::atomic{true}); + compact_options.canceled = canceledPtr.get(); + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + Random rnd(301); + auto generate_files = [&]() { + for (int i = 0; i < options.num_levels; i++) { + for (int j = 0; j < options.num_levels - i + 1; j++) { + for (int k = 0; k < 1000; k++) { + ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); + } + Flush(); + } + + for (int l = 1; l < options.num_levels - i; l++) { + MoveFilesToLevel(l); + } + } + }; + + DestroyAndReopen(options); + generate_files(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + int run_manual_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():PausingManualCompaction:1", + [&](void* /*arg*/) { run_manual_compactions++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Setup a callback to disable compactions after a couple of levels are + // compacted + int compactions_run = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction()::1", + [&](void* /*arg*/) { ++compactions_run; }); + + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); + + // Since compactions are disabled, we shouldn't start compacting. + // E.g. we should call the compaction function exactly one time. + ASSERT_EQ(compactions_run, 0); + ASSERT_EQ(run_manual_compactions, 0); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + compactions_run = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::RunManualCompaction()::1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) { + ++compactions_run; + // After 3 compactions disable + if (compactions_run == 3) { + compact_options.canceled->store(true, std::memory_order_release); + } + }); + + compact_options.canceled->store(false, std::memory_order_release); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); + + ASSERT_EQ(compactions_run, 3); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::RunManualCompaction()::1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionJob::Run():PausingManualCompaction:1"); + + // Compactions should work again if we re-enable them.. 
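+  // `CompactRangeOptions::canceled` is only consulted by the compaction that
+  // is currently running; clearing it back to false before issuing a new
+  // CompactRange() lets the next manual compaction run to completion.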
+ compact_options.canceled->store(false, std::memory_order_relaxed); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, CancelManualCompaction2) { + CompactRangeOptions compact_options; + auto canceledPtr = + std::unique_ptr>(new std::atomic{true}); + compact_options.canceled = canceledPtr.get(); + compact_options.max_subcompactions = 1; + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + + Random rnd(301); + auto generate_files = [&]() { + for (int i = 0; i < options.num_levels; i++) { + for (int j = 0; j < options.num_levels - i + 1; j++) { + for (int k = 0; k < 1000; k++) { + ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); + } + Flush(); + } + + for (int l = 1; l < options.num_levels - i; l++) { + MoveFilesToLevel(l); + } + } + }; + + DestroyAndReopen(options); + generate_files(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + int compactions_run = 0; + std::atomic kv_compactions{0}; + int compactions_stopped_at = 0; + int kv_compactions_stopped_at = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) { + ++compactions_run; + // After 3 compactions disable + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator:ProcessKV", [&](void* /*arg*/) { + int kv_compactions_run = + kv_compactions.fetch_add(1, std::memory_order_release); + if (kv_compactions_run == 5) { + compact_options.canceled->store(true, std::memory_order_release); + kv_compactions_stopped_at = kv_compactions_run; + compactions_stopped_at = compactions_run; + } + }); + + compact_options.canceled->store(false, std::memory_order_release); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); + + // NOTE: as we set compact_options.max_subcompacitons = 1, and store true to + // the canceled variable from the single compacting thread (via callback), + // this value is deterministically kv_compactions_stopped_at + 1. + ASSERT_EQ(kv_compactions, kv_compactions_stopped_at + 1); + ASSERT_EQ(compactions_run, compactions_stopped_at); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionIterator::ProcessKV"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::RunManualCompaction()::1"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "CompactionJob::Run():PausingManualCompaction:1"); + + // Compactions should work again if we re-enable them.. 
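+  // With the cancellation flag cleared, this final manual compaction runs to
+  // completion and pushes all data to the last level, which the
+  // FilesPerLevel() check below verifies.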
+ compact_options.canceled->store(false, std::memory_order_relaxed); + dbfull()->CompactRange(compact_options, nullptr, nullptr); + dbfull()->TEST_WaitForCompact(true); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBTest2, OptimizeForPointLookup) { Options options = CurrentOptions(); Close(); @@ -3297,7 +3724,7 @@ TEST_F(DBTest2, RateLimitedCompactionReads) { BlockBasedTableOptions bbto; bbto.block_size = 16384; bbto.no_block_cache = true; - options.table_factory.reset(new BlockBasedTableFactory(bbto)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); for (int i = 0; i < kNumL0Files; ++i) { @@ -3342,6 +3769,7 @@ TEST_F(DBTest2, RateLimitedCompactionReads) { // is on levels higher than the new num_levels. TEST_F(DBTest2, ReduceLevel) { Options options; + options.env = env_; options.disable_auto_compactions = true; options.num_levels = 7; Reopen(options); @@ -3370,6 +3798,7 @@ TEST_F(DBTest2, ReadCallbackTest) { Options options; options.disable_auto_compactions = true; options.num_levels = 7; + options.env = env_; Reopen(options); std::vector snapshots; // Try to create a db with multiple layers and a memtable @@ -3503,20 +3932,26 @@ TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) { TEST_F(DBTest2, TestNumPread) { Options options = CurrentOptions(); + bool prefetch_supported = + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); // disable block cache BlockBasedTableOptions table_options; table_options.no_block_cache = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); env_->count_random_reads_ = true; - env_->random_file_open_counter_.store(0); ASSERT_OK(Put("bar", "foo")); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); - // After flush, we'll open the file and read footer, meta block, - // property block and index block. - ASSERT_EQ(4, env_->random_read_counter_.Read()); + if (prefetch_supported) { + // After flush, we'll open the file and read footer, meta block, + // property block and index block. + ASSERT_EQ(4, env_->random_read_counter_.Read()); + } else { + // With prefetch not supported, we will do a single read into a buffer + ASSERT_EQ(1, env_->random_read_counter_.Read()); + } ASSERT_EQ(1, env_->random_file_open_counter_.load()); // One pread per a normal data block read @@ -3532,19 +3967,30 @@ TEST_F(DBTest2, TestNumPread) { ASSERT_OK(Put("bar2", "foo2")); ASSERT_OK(Put("foo2", "bar2")); ASSERT_OK(Flush()); - // After flush, we'll open the file and read footer, meta block, - // property block and index block. - ASSERT_EQ(4, env_->random_read_counter_.Read()); + if (prefetch_supported) { + // After flush, we'll open the file and read footer, meta block, + // property block and index block. + ASSERT_EQ(4, env_->random_read_counter_.Read()); + } else { + // With prefetch not supported, we will do a single read into a buffer + ASSERT_EQ(1, env_->random_read_counter_.Read()); + } ASSERT_EQ(1, env_->random_file_open_counter_.load()); - // Compaction needs two input blocks, which requires 2 preads, and - // generate a new SST file which needs 4 preads (footer, meta block, - // property block and index block). In total 6. 
env_->random_file_open_counter_.store(0); env_->random_read_counter_.Reset(); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ(6, env_->random_read_counter_.Read()); - // All compactin input files should have already been opened. + if (prefetch_supported) { + // Compaction needs two input blocks, which requires 2 preads, and + // generate a new SST file which needs 4 preads (footer, meta block, + // property block and index block). In total 6. + ASSERT_EQ(6, env_->random_read_counter_.Read()); + } else { + // With prefetch off, compaction needs two input blocks, + // followed by a single buffered read. In total 3. + ASSERT_EQ(3, env_->random_read_counter_.Read()); + } + // All compaction input files should have already been opened. ASSERT_EQ(1, env_->random_file_open_counter_.load()); // One pread per a normal data block read @@ -3629,7 +4075,9 @@ TEST_F(DBTest2, TraceAndReplay) { column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3704,7 +4152,9 @@ TEST_F(DBTest2, TraceWithLimit) { column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3772,7 +4222,9 @@ TEST_F(DBTest2, TraceWithSampling) { column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); @@ -3850,7 +4302,7 @@ TEST_F(DBTest2, TraceWithFilter) { // Open another db, replay, and verify the data std::string value; - std::string dbname2 = test::TmpDir(env_) + "/db_replay"; + std::string dbname2 = test::PerThreadDBPath(env_, "db_replay"); ASSERT_OK(DestroyDB(dbname2, options)); // Using a different name than db2, to pacify infer's use-after-lifetime @@ -3872,7 +4324,9 @@ TEST_F(DBTest2, TraceWithFilter) { column_families.push_back( ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + DBOptions db_opts; + db_opts.env = env_; + ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3899,7 +4353,7 @@ TEST_F(DBTest2, TraceWithFilter) { ASSERT_OK(DestroyDB(dbname2, options)); // Set up a new db. 
- std::string dbname3 = test::TmpDir(env_) + "/db_not_trace_read"; + std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read"); ASSERT_OK(DestroyDB(dbname3, options)); DB* db3_init = nullptr; @@ -3918,7 +4372,7 @@ TEST_F(DBTest2, TraceWithFilter) { handles.clear(); DB* db3 = nullptr; - ASSERT_OK(DB::Open(DBOptions(), dbname3, column_families, &handles, &db3)); + ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist @@ -3974,6 +4428,11 @@ TEST_F(DBTest2, TraceWithFilter) { TEST_F(DBTest2, PinnableSliceAndMmapReads) { Options options = CurrentOptions(); + options.env = env_; + if (!IsMemoryMappedAccessSupported()) { + ROCKSDB_GTEST_SKIP("Test requires default environment"); + return; + } options.allow_mmap_reads = true; options.max_open_files = 100; options.compression = kNoCompression; @@ -4026,7 +4485,7 @@ TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) { bbto.cache_index_and_filter_blocks = false; bbto.block_cache = NewLRUCache(100000); bbto.block_size = 400; // small block size - options.table_factory.reset(new BlockBasedTableFactory(bbto)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); Random rnd(301); @@ -4252,6 +4711,7 @@ TEST_F(DBTest2, TestCompactFiles) { SyncPoint::GetInstance()->EnableProcessing(); Options options; + options.env = env_; options.num_levels = 2; options.disable_auto_compactions = true; Reopen(options); @@ -4310,7 +4770,7 @@ TEST_F(DBTest2, MultiDBParallelOpenTest) { Options options = CurrentOptions(); std::vector dbnames; for (int i = 0; i < kNumDbs; ++i) { - dbnames.emplace_back(test::TmpDir(env_) + "/db" + ToString(i)); + dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + ToString(i))); ASSERT_OK(DestroyDB(dbnames.back(), options)); } @@ -4376,8 +4836,8 @@ class DummyOldStats : public Statistics { } bool HistEnabledForType(uint32_t /*type*/) const override { return false; } std::string ToString() const override { return ""; } - int num_rt = 0; - int num_mt = 0; + std::atomic num_rt{0}; + std::atomic num_mt{0}; }; } // namespace @@ -4569,7 +5029,7 @@ TEST_F(DBTest2, CrashInRecoveryMultipleCF) { for (const auto& f : filenames) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == FileType::kLogFile) { + if (ParseFileName(f, &number, &type) && type == FileType::kWalFile) { std::string fname = dbname_ + "/" + f; std::string file_content; ASSERT_OK(ReadFileToString(env_, fname, &file_content)); @@ -4703,7 +5163,7 @@ TEST_F(DBTest2, SameSmallestInSameLevel) { ASSERT_OK(Put("key", "2")); ASSERT_OK(db_->Merge(WriteOptions(), "key", "3")); ASSERT_OK(db_->Merge(WriteOptions(), "key", "4")); - Flush(); + ASSERT_OK(Flush()); CompactRangeOptions cro; cro.change_level = true; cro.target_level = 2; @@ -4807,6 +5267,7 @@ TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) { TEST_F(DBTest2, PartitionedIndexPrefetchFailure) { Options options = last_options_; + options.env = env_; options.max_open_files = 20; BlockBasedTableOptions bbto; bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; @@ -5153,6 +5614,120 @@ TEST_F(DBTest2, AutoPrefixMode1) { ASSERT_EQ("a1", iterator->key().ToString()); } } + +class RenameCurrentTest : public DBTestBase, + public testing::WithParamInterface { + public: + RenameCurrentTest() + : DBTestBase("rename_current_test", /*env_do_fsync=*/true), + sync_point_(GetParam()) {} + + ~RenameCurrentTest() override {} + + void SetUp() override { + 
env_->no_file_overwrite_.store(true, std::memory_order_release); + } + + void TearDown() override { + env_->no_file_overwrite_.store(false, std::memory_order_release); + } + + void SetupSyncPoints() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) { + Status* s = reinterpret_cast(arg); + assert(s); + *s = Status::IOError("Injected IO error."); + }); + } + + const std::string sync_point_; +}; + +INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest, + ::testing::Values("SetCurrentFile:BeforeRename", + "SetCurrentFile:AfterRename")); + +TEST_P(RenameCurrentTest, Open) { + Destroy(last_options_); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + SetupSyncPoints(); + SyncPoint::GetInstance()->EnableProcessing(); + Status s = TryReopen(options); + ASSERT_NOK(s); + + SyncPoint::GetInstance()->DisableProcessing(); + Reopen(options); +} + +TEST_P(RenameCurrentTest, Flush) { + Destroy(last_options_); + Options options = GetDefaultOptions(); + options.max_manifest_file_size = 1; + options.create_if_missing = true; + Reopen(options); + ASSERT_OK(Put("key", "value")); + SetupSyncPoints(); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_NOK(Flush()); + + ASSERT_NOK(Put("foo", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + Reopen(options); + ASSERT_EQ("value", Get("key")); + ASSERT_EQ("NOT_FOUND", Get("foo")); +} + +TEST_P(RenameCurrentTest, Compaction) { + Destroy(last_options_); + Options options = GetDefaultOptions(); + options.max_manifest_file_size = 1; + options.create_if_missing = true; + Reopen(options); + ASSERT_OK(Put("a", "a_value")); + ASSERT_OK(Put("c", "c_value")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("b", "b_value")); + ASSERT_OK(Put("d", "d_value")); + ASSERT_OK(Flush()); + + SetupSyncPoints(); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, + /*end=*/nullptr)); + + ASSERT_NOK(Put("foo", "value")); + + SyncPoint::GetInstance()->DisableProcessing(); + Reopen(options); + ASSERT_EQ("NOT_FOUND", Get("foo")); + ASSERT_EQ("d_value", Get("d")); +} + +TEST_F(DBTest2, BottommostTemperature) { + Options options = CurrentOptions(); + options.bottommost_temperature = Temperature::kWarm; + options.level0_file_num_compaction_trigger = 2; + Reopen(options); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Reopen(options); + + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature); +} #endif // ROCKSDB_LITE // WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery. 
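An illustrative aside, not part of the patch: the RenameCurrentTest cases above rely on RocksDB's test-only SyncPoint facility to make the CURRENT-file rename fail. A minimal sketch of that injection pattern, mirroring the Status-overwriting callback in SetupSyncPoints() and the sync point names used by the parameterized tests, is shown below; the helper name is hypothetical.

#include <string>

#include "rocksdb/status.h"
#include "test_util/sync_point.h"

// Hypothetical helper mirroring RenameCurrentTest::SetupSyncPoints(): the
// callback receives a pointer to the Status that SetCurrentFile() is about to
// return and overwrites it, simulating a failed rename of the CURRENT file.
void InjectCurrentRenameFailure(const std::string& sync_point) {
  using ROCKSDB_NAMESPACE::SyncPoint;
  SyncPoint::GetInstance()->SetCallBack(sync_point, [](void* arg) {
    auto* s = reinterpret_cast<ROCKSDB_NAMESPACE::Status*>(arg);
    *s = ROCKSDB_NAMESPACE::Status::IOError("Injected IO error.");
  });
  SyncPoint::GetInstance()->EnableProcessing();
}

// Usage (as in the tests above):
//   InjectCurrentRenameFailure("SetCurrentFile:BeforeRename");
//   ... open/flush/compact and expect a non-OK status ...
//   ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();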
@@ -5180,6 +5755,34 @@ TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) { Status s = TryReopen(options); ASSERT_TRUE(s.IsIOError()); } + +TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:Start:1", + "PointInTimeRecoveryWithSyncFailureInCFCreation:1"}, + {"PointInTimeRecoveryWithSyncFailureInCFCreation:2", + "DBImpl::BackgroundCallFlush:Start:2"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + CreateColumnFamilies({"test1"}, Options()); + ASSERT_OK(Put("foo", "bar")); + + // Creating a CF when a flush is going on, log is synced but the + // closed log file is not synced and corrupted. + port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); }); + TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1"); + CreateColumnFamilies({"test2"}, Options()); + env_->corrupt_in_sync_ = true; + TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2"); + flush_thread.join(); + env_->corrupt_in_sync_ = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + // Reopening the DB should not corrupt anything + Options options = CurrentOptions(); + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + ReopenWithColumnFamilies({"default", "test1", "test2"}, options); +} } // namespace ROCKSDB_NAMESPACE #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS diff --git a/db/db_test_util.cc b/db/db_test_util.cc index dd79a71950c..1d81774815a 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -10,6 +10,7 @@ #include "db/db_test_util.h" #include "db/forward_iterator.h" +#include "env/mock_env.h" #include "rocksdb/convenience.h" #include "rocksdb/env_encryption.h" #include "util/stderr_logger.h" @@ -52,6 +53,7 @@ SpecialEnv::SpecialEnv(Env* base, bool time_elapse_only_sleep) manifest_sync_error_.store(false, std::memory_order_release); manifest_write_error_.store(false, std::memory_order_release); log_write_error_.store(false, std::memory_order_release); + no_file_overwrite_.store(false, std::memory_order_release); random_file_open_counter_.store(0, std::memory_order_relaxed); delete_count_.store(0, std::memory_order_relaxed); num_open_wal_file_.store(0); @@ -70,16 +72,8 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync) option_config_(kDefault), s3_env_(nullptr) { Env* base_env = Env::Default(); -#ifndef ROCKSDB_LITE - const char* test_env_uri = getenv("TEST_ENV_URI"); - if (test_env_uri) { - Env* test_env = nullptr; - Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_); - base_env = test_env; - EXPECT_OK(s); - EXPECT_NE(Env::Default(), base_env); - } -#endif // !ROCKSDB_LITE + ConfigOptions config_options; + EXPECT_OK(test::CreateEnvFromSystem(config_options, &base_env, &env_guard_)); EXPECT_NE(nullptr, base_env); if (getenv("MEM_ENV")) { mem_env_ = new MockEnv(base_env); @@ -88,7 +82,7 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync) if (getenv("ENCRYPTED_ENV")) { std::shared_ptr provider; Status s = EncryptionProvider::CreateFromString( - ConfigOptions(), std::string("test://") + getenv("ENCRYPTED_ENV"), + config_options, std::string("test://") + getenv("ENCRYPTED_ENV"), &provider); encrypted_env_ = NewEncryptedEnv(mem_env_ ? 
mem_env_ : base_env, provider); } @@ -254,28 +248,28 @@ bool DBTestBase::ChangeCompactOptions() { Destroy(last_options_); auto options = CurrentOptions(); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kUniversalCompaction) { option_config_ = kUniversalCompactionMultiLevel; Destroy(last_options_); auto options = CurrentOptions(); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kUniversalCompactionMultiLevel) { option_config_ = kLevelSubcompactions; Destroy(last_options_); auto options = CurrentOptions(); assert(options.max_subcompactions > 1); - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kLevelSubcompactions) { option_config_ = kUniversalSubcompactions; Destroy(last_options_); auto options = CurrentOptions(); assert(options.max_subcompactions > 1); - TryReopen(options); + Reopen(options); return true; } else { return false; @@ -290,7 +284,7 @@ bool DBTestBase::ChangeWalOptions() { auto options = CurrentOptions(); Destroy(options); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kDBLogDir) { option_config_ = kWalDirAndMmapReads; @@ -298,14 +292,14 @@ bool DBTestBase::ChangeWalOptions() { auto options = CurrentOptions(); Destroy(options); options.create_if_missing = true; - TryReopen(options); + Reopen(options); return true; } else if (option_config_ == kWalDirAndMmapReads) { option_config_ = kRecycleLogFiles; Destroy(last_options_); auto options = CurrentOptions(); Destroy(options); - TryReopen(options); + Reopen(options); return true; } else { return false; @@ -385,7 +379,7 @@ Options DBTestBase::CurrentOptions( return GetOptions(option_config_, default_options, options_override); } -Options DBTestBase::GetDefaultOptions() { +Options DBTestBase::GetDefaultOptions() const { Options options; options.write_buffer_size = 4090 * 4096; options.target_file_size_base = 2 * 1024 * 1024; @@ -393,6 +387,10 @@ Options DBTestBase::GetDefaultOptions() { options.max_open_files = 5000; options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; options.compaction_pri = CompactionPri::kByCompensatedSize; + options.env = env_; + if (!env_->skip_fsync_) { + options.track_and_verify_wals_in_manifest = true; + } return options; } @@ -421,28 +419,28 @@ Options DBTestBase::GetOptions( options.unordered_write = false; break; case kPlainTableFirstBytePrefix: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.allow_mmap_reads = can_allow_mmap; options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; case kPlainTableCappedPrefix: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewCappedPrefixTransform(8)); options.allow_mmap_reads = can_allow_mmap; options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; case kPlainTableCappedPrefixNonMmap: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewCappedPrefixTransform(8)); options.allow_mmap_reads = false; options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; case 
kPlainTableAllBytesPrefix: - options.table_factory.reset(new PlainTableFactory()); + options.table_factory.reset(NewPlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); options.allow_mmap_reads = can_allow_mmap; options.max_sequential_skip_in_iterations = 999999; @@ -536,6 +534,7 @@ Options DBTestBase::GetOptions( } case kFIFOCompaction: { options.compaction_style = kCompactionStyleFIFO; + options.max_open_files = -1; break; } case kBlockBasedTableWithPrefixHashIndex: { @@ -549,6 +548,7 @@ Options DBTestBase::GetOptions( break; } case kBlockBasedTableWithPartitionedIndex: { + table_options.format_version = 3; table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; options.prefix_extractor.reset(NewNoopTransform()); break; @@ -774,7 +774,7 @@ void DBTestBase::Close() { void DBTestBase::DestroyAndReopen(const Options& options) { // Destroy using last options Destroy(last_options_); - ASSERT_OK(TryReopen(options)); + Reopen(options); } void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) { @@ -819,9 +819,9 @@ Status DBTestBase::TryReopen(const Options& options) { // Note: operator= is an unsafe approach here since it destructs // std::shared_ptr in the same order of their creation, in contrast to // destructors which destructs them in the opposite order of creation. One - // particular problme is that the cache destructor might invoke callback + // particular problem is that the cache destructor might invoke callback // functions that use Option members such as statistics. To work around this - // problem, we manually call destructor of table_facotry which eventually + // problem, we manually call destructor of table_factory which eventually // clears the block cache. last_options_ = options; MaybeInstallTimeElapseOnlySleep(options); @@ -1078,7 +1078,8 @@ std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) { bool first = true; while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - if (ParseInternalKey(iter->key(), &ikey) != Status::OK()) { + if (ParseInternalKey(iter->key(), &ikey, true /* log_err_key */) != + Status::OK()) { result += "CORRUPTED"; } else { if (!last_options_.comparator->Equal(ikey.user_key, user_key)) { @@ -1235,29 +1236,77 @@ std::string DBTestBase::FilesPerLevel(int cf) { result.resize(last_non_zero_offset); return result; } + #endif // !ROCKSDB_LITE +std::vector DBTestBase::GetBlobFileNumbers() { + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + assert(versions); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + + Version* const current = cfd->current(); + assert(current); + + const VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + const auto& blob_files = storage_info->GetBlobFiles(); + + std::vector result; + result.reserve(blob_files.size()); + + for (const auto& blob_file : blob_files) { + result.emplace_back(blob_file.first); + } + + return result; +} + size_t DBTestBase::CountFiles() { + size_t count = 0; std::vector files; - env_->GetChildren(dbname_, &files); + if (env_->GetChildren(dbname_, &files).ok()) { + count += files.size(); + } - std::vector logfiles; if (dbname_ != last_options_.wal_dir) { - env_->GetChildren(last_options_.wal_dir, &logfiles); + if (env_->GetChildren(last_options_.wal_dir, &files).ok()) { + count += files.size(); + } } - return files.size() + logfiles.size(); + return count; +}; + +Status DBTestBase::CountFiles(size_t* count) { + std::vector 
files; + Status s = env_->GetChildren(dbname_, &files); + if (!s.ok()) { + return s; + } + size_t files_count = files.size(); + + if (dbname_ != last_options_.wal_dir) { + s = env_->GetChildren(last_options_.wal_dir, &files); + if (!s.ok()) { + return s; + } + *count = files_count + files.size(); + } + + return Status::OK(); } -uint64_t DBTestBase::Size(const Slice& start, const Slice& limit, int cf) { +Status DBTestBase::Size(const Slice& start, const Slice& limit, int cf, + uint64_t* size) { Range r(start, limit); - uint64_t size; if (cf == 0) { - db_->GetApproximateSizes(&r, 1, &size); + return db_->GetApproximateSizes(&r, 1, size); } else { - db_->GetApproximateSizes(handles_[1], &r, 1, &size); + return db_->GetApproximateSizes(handles_[1], &r, 1, size); } - return size; } void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit, @@ -1376,8 +1425,8 @@ void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) { } ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(200))); if (!nowait) { - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } @@ -1485,12 +1534,12 @@ void DBTestBase::validateNumberOfEntries(int numValues, int cf) { kMaxSequenceNumber)); } iter->SeekToFirst(); - ASSERT_EQ(iter->status().ok(), true); + ASSERT_OK(iter->status()); int seq = numValues; while (iter->Valid()) { ParsedInternalKey ikey; ikey.clear(); - ASSERT_OK(ParseInternalKey(iter->key(), &ikey)); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); // checks sequence number for updates ASSERT_EQ(ikey.sequence, (unsigned)seq--); @@ -1523,26 +1572,26 @@ void DBTestBase::CopyFile(const std::string& source, ASSERT_OK(destfile->Close()); } -Status DBTestBase::GetAllSSTFiles( - std::unordered_map* sst_files, +Status DBTestBase::GetAllDataFiles( + const FileType file_type, std::unordered_map* files, uint64_t* total_size /* = nullptr */) { if (total_size) { *total_size = 0; } - std::vector files; - Status s = env_->GetChildren(dbname_, &files); + std::vector children; + Status s = env_->GetChildren(dbname_, &children); if (s.ok()) { - for (auto& file_name : files) { + for (auto& file_name : children) { uint64_t number; FileType type; - if (ParseFileName(file_name, &number, &type) && type == kTableFile) { + if (ParseFileName(file_name, &number, &type) && type == file_type) { std::string file_path = dbname_ + "/" + file_name; uint64_t file_size = 0; s = env_->GetFileSize(file_path, &file_size); if (!s.ok()) { break; } - (*sst_files)[file_path] = file_size; + (*files)[file_path] = file_size; if (total_size) { *total_size += file_size; } @@ -1695,7 +1744,7 @@ void DBTestBase::VerifyDBInternal( for (auto p : true_data) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey ikey; - ASSERT_OK(ParseInternalKey(iter->key(), &ikey)); + ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); ASSERT_EQ(p.first, ikey.user_key); ASSERT_EQ(p.second, iter->value()); iter->Next(); diff --git a/db/db_test_util.h b/db/db_test_util.h index a78a37eea6a..bcb93c8055d 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -23,7 +23,6 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" -#include "env/mock_env.h" #include "file/filename.h" #include "memtable/hash_linklist_rep.h" #include "rocksdb/cache.h" @@ -40,7 +39,6 @@ #include "rocksdb/utilities/checkpoint.h" #include "table/mock_table.h" #include "table/scoped_arena_iterator.h" -#include 
"test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "util/cast_util.h" @@ -52,6 +50,7 @@ extern "C" bool RocksDbFileChecksumsVerificationEnabledOnRecovery(); namespace ROCKSDB_NAMESPACE { +class MockEnv; namespace anon { class AtomicCounter { @@ -232,6 +231,11 @@ class SpecialEnv : public EnvWrapper { return base_->Append(data); } } + Status Append( + const Slice& data, + const DataVerificationInfo& /* verification_info */) override { + return Append(data); + } Status PositionedAppend(const Slice& data, uint64_t offset) override { if (env_->table_write_callback_) { (*env_->table_write_callback_)(); @@ -246,6 +250,11 @@ class SpecialEnv : public EnvWrapper { return base_->PositionedAppend(data, offset); } } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& /* verification_info */) override { + return PositionedAppend(data, offset); + } Status Truncate(uint64_t size) override { return base_->Truncate(size); } Status RangeSync(uint64_t offset, uint64_t nbytes) override { Status s = base_->RangeSync(offset, nbytes); @@ -296,6 +305,9 @@ class SpecialEnv : public EnvWrapper { Status Allocate(uint64_t offset, uint64_t len) override { return base_->Allocate(offset, len); } + size_t GetUniqueId(char* id, size_t max_size) const override { + return base_->GetUniqueId(id, max_size); + } }; class ManifestFile : public WritableFile { public: @@ -308,6 +320,12 @@ class SpecialEnv : public EnvWrapper { return base_->Append(data); } } + Status Append( + const Slice& data, + const DataVerificationInfo& /*verification_info*/) override { + return Append(data); + } + Status Truncate(uint64_t size) override { return base_->Truncate(size); } Status Close() override { return base_->Close(); } Status Flush() override { return base_->Flush(); } @@ -359,15 +377,26 @@ class SpecialEnv : public EnvWrapper { #endif return s; } + Status Append( + const Slice& data, + const DataVerificationInfo& /* verification_info */) override { + return Append(data); + } Status Truncate(uint64_t size) override { return base_->Truncate(size); } + void PrepareWrite(size_t offset, size_t len) override { + base_->PrepareWrite(offset, len); + } + void SetPreallocationBlockSize(size_t size) override { + base_->SetPreallocationBlockSize(size); + } Status Close() override { // SyncPoint is not supported in Released Windows Mode. #if !(defined NDEBUG) || !defined(OS_WIN) // Check preallocation size - // preallocation size is never passed to base file. 
- size_t preallocation_size = preallocation_block_size(); + size_t block_size, last_allocated_block; + base_->GetPreallocationStatus(&block_size, &last_allocated_block); TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus", - &preallocation_size); + &block_size); #endif // !(defined NDEBUG) || !defined(OS_WIN) return base_->Close(); @@ -375,6 +404,10 @@ class SpecialEnv : public EnvWrapper { Status Flush() override { return base_->Flush(); } Status Sync() override { ++env_->sync_counter_; + if (env_->corrupt_in_sync_) { + Append(std::string(33000, ' ')); + return Status::IOError("Ingested Sync Failure"); + } if (env_->skip_fsync_) { return Status::OK(); } else { @@ -397,6 +430,11 @@ class SpecialEnv : public EnvWrapper { OtherFile(SpecialEnv* env, std::unique_ptr&& b) : env_(env), base_(std::move(b)) {} Status Append(const Slice& data) override { return base_->Append(data); } + Status Append( + const Slice& data, + const DataVerificationInfo& /*verification_info*/) override { + return Append(data); + } Status Truncate(uint64_t size) override { return base_->Truncate(size); } Status Close() override { return base_->Close(); } Status Flush() override { return base_->Flush(); } @@ -417,6 +455,11 @@ class SpecialEnv : public EnvWrapper { std::unique_ptr base_; }; + if (no_file_overwrite_.load(std::memory_order_acquire) && + target()->FileExists(f).ok()) { + return Status::NotSupported("SpecialEnv::no_file_overwrite_ is true."); + } + if (non_writeable_rate_.load(std::memory_order_acquire) > 0) { uint32_t random_number; { @@ -664,6 +707,9 @@ class SpecialEnv : public EnvWrapper { // Slow down every log write, in micro-seconds. std::atomic log_write_slowdown_; + // If true, returns Status::NotSupported for file overwrite. + std::atomic no_file_overwrite_; + // Number of WAL files that are still open for write. std::atomic num_open_wal_file_; @@ -686,6 +732,9 @@ class SpecialEnv : public EnvWrapper { // If true, all fsync to files and directories are skipped. bool skip_fsync_ = false; + // If true, ingest the corruption to file during sync. 
+ bool corrupt_in_sync_ = false; + std::atomic non_writeable_rate_; std::atomic new_writable_count_; @@ -738,6 +787,17 @@ class OnFileDeletionListener : public EventListener { size_t matched_count_; std::string expected_file_name_; }; + +class FlushCounterListener : public EventListener { + public: + std::atomic count{0}; + std::atomic expected_flush_reason{FlushReason::kOthers}; + + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override { + count++; + ASSERT_EQ(expected_flush_reason.load(), flush_job_info.flush_reason); + } +}; #endif // A test merge operator mimics put but also fails if one of merge operands is @@ -771,6 +831,7 @@ class CacheWrapper : public Cache { const char* Name() const override { return target_->Name(); } + using Cache::Insert; Status Insert(const Slice& key, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), Handle** handle = nullptr, @@ -778,12 +839,14 @@ class CacheWrapper : public Cache { return target_->Insert(key, value, charge, deleter, handle, priority); } + using Cache::Lookup; Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override { return target_->Lookup(key, stats); } bool Ref(Handle* handle) override { return target_->Ref(handle); } + using Cache::Release; bool Release(Handle* handle, bool force_erase = false) override { return target_->Release(handle, force_erase); } @@ -817,11 +880,22 @@ class CacheWrapper : public Cache { return target_->GetCharge(handle); } + DeleterFn GetDeleter(Handle* handle) const override { + return target_->GetDeleter(handle); + } + void ApplyToAllCacheEntries(void (*callback)(void*, size_t), bool thread_safe) override { target_->ApplyToAllCacheEntries(callback, thread_safe); } + void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override { + target_->ApplyToAllEntries(callback, opts); + } + void EraseUnRefEntries() override { target_->EraseUnRefEntries(); } protected: @@ -967,10 +1041,13 @@ class DBTestBase : public testing::Test { const anon::OptionsOverride& options_override = anon::OptionsOverride()) const; - static Options GetDefaultOptions(); + Options GetDefaultOptions() const; + + Options GetOptions(int option_config) const { + return GetOptions(option_config, GetDefaultOptions()); + } - Options GetOptions(int option_config, - const Options& default_options = GetDefaultOptions(), + Options GetOptions(int option_config, const Options& default_options, const anon::OptionsOverride& options_override = anon::OptionsOverride()) const; @@ -1080,12 +1157,20 @@ class DBTestBase : public testing::Test { int TotalTableFiles(int cf = 0, int levels = -1); #endif // ROCKSDB_LITE + std::vector GetBlobFileNumbers(); + // Return spread of files per level std::string FilesPerLevel(int cf = 0); size_t CountFiles(); - uint64_t Size(const Slice& start, const Slice& limit, int cf = 0); + Status CountFiles(size_t* count); + + Status Size(const Slice& start, const Slice& limit, uint64_t* size) { + return Size(start, limit, 0, size); + } + + Status Size(const Slice& start, const Slice& limit, int cf, uint64_t* size); void Compact(int cf, const Slice& start, const Slice& limit, uint32_t target_path_id); @@ -1163,8 +1248,9 @@ class DBTestBase : public testing::Test { void CopyFile(const std::string& source, const std::string& destination, uint64_t size = 0); - Status GetAllSSTFiles(std::unordered_map* sst_files, - uint64_t* total_size = nullptr); + Status GetAllDataFiles(const FileType file_type, + std::unordered_map* sst_files, + uint64_t* 
total_size = nullptr); std::vector ListTableFiles(Env* env, const std::string& path); diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index 119fc66c508..548b8ae0e08 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -996,7 +996,7 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) { // First, clean up memtable before inserting new data. This will generate // a level-0 file, with size around 0.4 (according to previously written // data amount). - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); for (int num = 0; num < options.level0_file_num_compaction_trigger - 3; num++) { // Write 110KB (11 values, each 10K) @@ -1781,7 +1781,7 @@ TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) { dbfull()->TEST_write_controler().GetCompactionPressureToken(); ASSERT_OK(Put("key", "val")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(num_levels_ - 1), 1); ColumnFamilyMetaData cf_meta; @@ -1807,7 +1807,7 @@ TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) { "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"); for (int i = 0; i < 2; ++i) { ASSERT_OK(Put("key", "val")); - Flush(); + ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); @@ -1911,7 +1911,7 @@ TEST_F(DBTestUniversalCompaction2, BasicL0toL1) { for (i = 0; i < 2000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); // MoveFilesToLevel(6); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -1923,7 +1923,7 @@ TEST_F(DBTestUniversalCompaction2, BasicL0toL1) { ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -1954,7 +1954,7 @@ TEST_F(DBTestUniversalCompaction2, SingleLevel) { for (i = 0; i < 2000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 1999; i < kNumKeys; ++i) { if (i >= kNumKeys - kWindowSize && @@ -1964,7 +1964,7 @@ TEST_F(DBTestUniversalCompaction2, SingleLevel) { ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()(; ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, NumTableFilesAtLevel(0)); @@ -1992,19 +1992,19 @@ TEST_F(DBTestUniversalCompaction2, MultipleLevels) { for (i = 0; i < 500; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 500; i < 1000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 1000; i < 1500; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 1500; i < 2000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -2013,15 +2013,15 @@ TEST_F(DBTestUniversalCompaction2, MultipleLevels) { for (i = 1999; i < 2333; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2333; i < 2666; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2666; i < 2999; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -2031,7 +2031,7 @@ TEST_F(DBTestUniversalCompaction2, MultipleLevels) { for (i = 1900; i < 2100; ++i) { ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); 
ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -2064,19 +2064,19 @@ TEST_F(DBTestUniversalCompaction2, OverlappingL0) { for (i = 0; i < 2000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2000; i < 3000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 3500; i < 4000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); for (i = 2900; i < 3100; ++i) { ASSERT_OK(Delete(Key(i))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(2, NumTableFilesAtLevel(0)); @@ -2106,7 +2106,7 @@ TEST_F(DBTestUniversalCompaction2, IngestBehind) { for (i = 0; i < 2000; ++i) { ASSERT_OK(Put(Key(i), "val")); } - Flush(); + ASSERT_OK(Flush()); // MoveFilesToLevel(6); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -2118,7 +2118,7 @@ TEST_F(DBTestUniversalCompaction2, IngestBehind) { ASSERT_OK(Put(Key(i), "val")); } } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); @@ -2129,7 +2129,7 @@ TEST_F(DBTestUniversalCompaction2, IngestBehind) { TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) { Options options; options.compaction_style = kCompactionStyleUniversal; - + options.env = env_; KeepFilterFactory* filter = new KeepFilterFactory(true); options.compaction_filter_factory.reset(filter); Reopen(options); @@ -2184,7 +2184,7 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { // Case 1: Oldest flushed file excceeds periodic compaction threshold. ASSERT_OK(Put("foo", "bar")); - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ(0, periodic_compactions); // Move clock forward so that the flushed file would qualify periodic // compaction. @@ -2192,7 +2192,7 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { // Another flush would trigger compaction the oldest file. ASSERT_OK(Put("foo", "bar2")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, periodic_compactions); @@ -2203,7 +2203,7 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { periodic_compactions = 0; // A flush doesn't trigger a periodic compaction when threshold not hit ASSERT_OK(Put("foo", "bar2")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, periodic_compactions); @@ -2211,7 +2211,7 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { // a compaction ASSERT_OK(Put("foo", "bar2")); env_->MockSleepForSeconds(48 * 60 * 60 + 100); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(1, periodic_compactions); ASSERT_EQ(0, start_level); diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 69940840fa9..dd632742e2b 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -8,12 +8,13 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/db_test_util.h" -#include "env/composite_env_wrapper.h" #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/file_system.h" #include "test_util/sync_point.h" #include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { class DBWALTestBase : public DBTestBase { @@ -23,13 +24,37 @@ class DBWALTestBase : public DBTestBase { #if defined(ROCKSDB_PLATFORM_POSIX) public: +#if defined(ROCKSDB_FALLOCATE_PRESENT) + bool IsFallocateSupported() { + // Test fallocate support of running file system. + // Skip this test if fallocate is not supported. + std::string fname_test_fallocate = dbname_ + "/preallocate_testfile"; + int fd = -1; + do { + fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + } while (fd < 0 && errno == EINTR); + assert(fd > 0); + int alloc_status = fallocate(fd, 0, 0, 1); + int err_number = errno; + close(fd); + assert(env_->DeleteFile(fname_test_fallocate) == Status::OK()); + if (err_number == ENOSYS || err_number == EOPNOTSUPP) { + fprintf(stderr, "Skipped preallocated space check: %s\n", + errnoStr(err_number).c_str()); + return false; + } + assert(alloc_status == 0); + return true; + } +#endif // ROCKSDB_FALLOCATE_PRESENT + uint64_t GetAllocatedFileSize(std::string file_name) { struct stat sbuf; int err = stat(file_name.c_str(), &sbuf); assert(err == 0); return sbuf.st_blocks * 512; } -#endif +#endif // ROCKSDB_PLATFORM_POSIX }; class DBWALTest : public DBWALTestBase { @@ -47,8 +72,8 @@ class EnrichedSpecialEnv : public SpecialEnv { InstrumentedMutexLock l(&env_mutex_); if (f == skipped_wal) { deleted_wal_reopened = true; - if (IsWAL(f) && largetest_deleted_wal.size() != 0 && - f.compare(largetest_deleted_wal) <= 0) { + if (IsWAL(f) && largest_deleted_wal.size() != 0 && + f.compare(largest_deleted_wal) <= 0) { gap_in_wals = true; } } @@ -62,9 +87,9 @@ class EnrichedSpecialEnv : public SpecialEnv { // remember its name partly because the application might attempt to // delete the file again. if (skipped_wal.size() != 0 && skipped_wal != fname) { - if (largetest_deleted_wal.size() == 0 || - largetest_deleted_wal.compare(fname) < 0) { - largetest_deleted_wal = fname; + if (largest_deleted_wal.size() == 0 || + largest_deleted_wal.compare(fname) < 0) { + largest_deleted_wal = fname; } } else { skipped_wal = fname; @@ -82,7 +107,7 @@ class EnrichedSpecialEnv : public SpecialEnv { // the wal whose actual delete was skipped by the env std::string skipped_wal = ""; // the largest WAL that was requested to be deleted - std::string largetest_deleted_wal = ""; + std::string largest_deleted_wal = ""; // number of WALs that were successfully deleted std::atomic deleted_wal_cnt = {0}; // the WAL whose delete from fs was skipped is reopened during recovery @@ -358,16 +383,16 @@ TEST_F(DBWALTest, RecoverWithBlob) { // There should be no files just yet since we haven't flushed. 
{ VersionSet* const versions = dbfull()->TEST_GetVersionSet(); - assert(versions); + ASSERT_NE(versions, nullptr); ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); - assert(cfd); + ASSERT_NE(cfd, nullptr); Version* const current = cfd->current(); - assert(current); + ASSERT_NE(current, nullptr); const VersionStorageInfo* const storage_info = current->storage_info(); - assert(storage_info); + ASSERT_NE(storage_info, nullptr); ASSERT_EQ(storage_info->num_non_empty_levels(), 0); ASSERT_TRUE(storage_info->GetBlobFiles().empty()); @@ -380,37 +405,36 @@ TEST_F(DBWALTest, RecoverWithBlob) { options.min_blob_size = min_blob_size; options.avoid_flush_during_recovery = false; options.disable_auto_compactions = true; + options.env = env_; Reopen(options); ASSERT_EQ(Get("key1"), short_value); - - // TODO: enable once Get support is implemented for blobs - // ASSERT_EQ(Get("key2"), long_value); + ASSERT_EQ(Get("key2"), long_value); VersionSet* const versions = dbfull()->TEST_GetVersionSet(); - assert(versions); + ASSERT_NE(versions, nullptr); ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); - assert(cfd); + ASSERT_NE(cfd, nullptr); Version* const current = cfd->current(); - assert(current); + ASSERT_NE(current, nullptr); const VersionStorageInfo* const storage_info = current->storage_info(); - assert(storage_info); + ASSERT_NE(storage_info, nullptr); const auto& l0_files = storage_info->LevelFiles(0); ASSERT_EQ(l0_files.size(), 1); const FileMetaData* const table_file = l0_files[0]; - assert(table_file); + ASSERT_NE(table_file, nullptr); const auto& blob_files = storage_info->GetBlobFiles(); ASSERT_EQ(blob_files.size(), 1); const auto& blob_file = blob_files.begin()->second; - assert(blob_file); + ASSERT_NE(blob_file, nullptr); ASSERT_EQ(table_file->smallest.user_key(), "key1"); ASSERT_EQ(table_file->largest.user_key(), "key2"); @@ -423,29 +447,177 @@ TEST_F(DBWALTest, RecoverWithBlob) { #ifndef ROCKSDB_LITE const InternalStats* const internal_stats = cfd->internal_stats(); - assert(internal_stats); - - const uint64_t expected_bytes = - table_file->fd.GetFileSize() + blob_file->GetTotalBlobBytes(); + ASSERT_NE(internal_stats, nullptr); const auto& compaction_stats = internal_stats->TEST_GetCompactionStats(); ASSERT_FALSE(compaction_stats.empty()); - ASSERT_EQ(compaction_stats[0].bytes_written, expected_bytes); - ASSERT_EQ(compaction_stats[0].num_output_files, 2); + ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize()); + ASSERT_EQ(compaction_stats[0].bytes_written_blob, + blob_file->GetTotalBlobBytes()); + ASSERT_EQ(compaction_stats[0].num_output_files, 1); + ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1); const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue(); - ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], expected_bytes); + ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], + compaction_stats[0].bytes_written + + compaction_stats[0].bytes_written_blob); #endif // ROCKSDB_LITE } +TEST_F(DBWALTest, RecoverWithBlobMultiSST) { + // Write several large (4 KB) values without flushing. Note that blob files + // are not actually enabled at this point. + std::string large_value(1 << 12, 'a'); + + constexpr int num_keys = 64; + + for (int i = 0; i < num_keys; ++i) { + ASSERT_OK(Put(Key(i), large_value)); + } + + // There should be no files just yet since we haven't flushed. 
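The reopen step above turns on the integrated BlobDB options (`enable_blob_files`, `min_blob_size`) so that recovery produces both table files and blob files. Outside the test harness, a minimal equivalent configuration (path and size values are placeholders) would be:

#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.enable_blob_files = true;      // store large values in blob files
  options.min_blob_size = 1024;          // values of at least 1 KB become blobs
  options.write_buffer_size = 64 << 10;  // small memtable, as in the MultiSST test

  rocksdb::DB* db = nullptr;
  assert(rocksdb::DB::Open(options, "/tmp/blob_recovery_demo", &db).ok());
  assert(db->Put(rocksdb::WriteOptions(), "key2", std::string(1 << 12, 'a')).ok());

  std::string value;
  assert(db->Get(rocksdb::ReadOptions(), "key2", &value).ok());
  assert(value.size() == (1 << 12));
  delete db;
  return 0;
}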
+ { + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + ASSERT_EQ(storage_info->num_non_empty_levels(), 0); + ASSERT_TRUE(storage_info->GetBlobFiles().empty()); + } + + // Reopen the database with blob files enabled and write buffer size set to a + // smaller value. Multiple table files+blob files should be written and added + // to the Version during recovery. + Options options; + options.write_buffer_size = 1 << 16; // 64 KB + options.enable_blob_files = true; + options.avoid_flush_during_recovery = false; + options.disable_auto_compactions = true; + options.env = env_; + + Reopen(options); + + for (int i = 0; i < num_keys; ++i) { + ASSERT_EQ(Get(Key(i)), large_value); + } + + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); + ASSERT_NE(versions, nullptr); + + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + ASSERT_NE(cfd, nullptr); + + Version* const current = cfd->current(); + ASSERT_NE(current, nullptr); + + const VersionStorageInfo* const storage_info = current->storage_info(); + ASSERT_NE(storage_info, nullptr); + + const auto& l0_files = storage_info->LevelFiles(0); + ASSERT_GT(l0_files.size(), 1); + + const auto& blob_files = storage_info->GetBlobFiles(); + ASSERT_GT(blob_files.size(), 1); + + ASSERT_EQ(l0_files.size(), blob_files.size()); +} + +TEST_F(DBWALTest, WALWithChecksumHandoff) { +#ifndef ROCKSDB_ASSERT_STATUS_CHECKED + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + do { + Options options = CurrentOptions(); + + options.checksum_handoff_file_types.Add(FileType::kWalFile); + options.env = fault_fs_env.get(); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + + CreateAndReopenWithCF({"pikachu"}, options); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // Both value's should be present. 
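`WALWithChecksumHandoff` above drives the `checksum_handoff_file_types` option through a `FaultInjectionTestFS`. In an ordinary application the option is enabled as sketched below, not part of this patch (the path is a placeholder; the handoff only takes effect if the underlying `FileSystem` supports checksum verification):

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Hand off a checksum of every WAL write to the FileSystem for verification.
  options.checksum_handoff_file_types.Add(rocksdb::FileType::kWalFile);

  rocksdb::DB* db = nullptr;
  assert(rocksdb::DB::Open(options, "/tmp/checksum_handoff_demo", &db).ok());
  assert(db->Put(rocksdb::WriteOptions(), "foo", "v1").ok());
  delete db;
  return 0;
}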
+ ASSERT_EQ("v2", Get(1, "bar")); + ASSERT_EQ("v2", Get(1, "foo")); + + writeOpt.disableWAL = true; + // This put, data is persisted by Flush + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + writeOpt.disableWAL = false; + // Data is persisted in the WAL + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "zoo", "v3")); + // The hash does not match, write fails + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash); + writeOpt.disableWAL = false; + ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); + + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // Due to the write failure, Get should not find + ASSERT_NE("v3", Get(1, "foo")); + ASSERT_EQ("v3", Get(1, "zoo")); + ASSERT_EQ("v3", Get(1, "bar")); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + // Each write will be similated as corrupted. + fault_fs->IngestDataCorruptionBeforeWrite(); + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v4")); + writeOpt.disableWAL = false; + ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v4")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_NE("v4", Get(1, "foo")); + ASSERT_NE("v4", Get(1, "bar")); + fault_fs->NoDataCorruptionBeforeWrite(); + + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum); + // The file system does not provide checksum method and verification. + writeOpt.disableWAL = true; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v5")); + writeOpt.disableWAL = false; + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v5")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ("v5", Get(1, "foo")); + ASSERT_EQ("v5", Get(1, "bar")); + + Destroy(options); + } while (ChangeWalOptions()); +#endif // ROCKSDB_ASSERT_STATUS_CHECKED +} + class DBRecoveryTestBlobError : public DBWALTest, public testing::WithParamInterface { public: - DBRecoveryTestBlobError() : fault_injection_env_(env_) {} - ~DBRecoveryTestBlobError() { Close(); } + DBRecoveryTestBlobError() : sync_point_(GetParam()) {} - FaultInjectionTestEnv fault_injection_env_; + std::string sync_point_; }; INSTANTIATE_TEST_CASE_P(DBRecoveryTestBlobError, DBRecoveryTestBlobError, @@ -459,20 +631,19 @@ TEST_P(DBRecoveryTestBlobError, RecoverWithBlobError) { // Reopen with blob files enabled but make blob file writing fail during // recovery. 
- SyncPoint::GetInstance()->SetCallBack(GetParam(), [this](void* /* arg */) { - fault_injection_env_.SetFilesystemActive(false, Status::IOError()); + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) { + Status* const s = static_cast(arg); + assert(s); + + (*s) = Status::IOError(sync_point_); }); - SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", [this](void* /* arg */) { - fault_injection_env_.SetFilesystemActive(true); - }); SyncPoint::GetInstance()->EnableProcessing(); Options options; options.enable_blob_files = true; options.avoid_flush_during_recovery = false; options.disable_auto_compactions = true; - options.env = &fault_injection_env_; + options.env = env_; ASSERT_NOK(TryReopen(options)); @@ -500,13 +671,11 @@ TEST_F(DBWALTest, IgnoreRecoveredLog) { do { // delete old files in backup_logs directory - env_->CreateDirIfMissing(backup_logs); + ASSERT_OK(env_->CreateDirIfMissing(backup_logs)); std::vector old_files; - env_->GetChildren(backup_logs, &old_files); + ASSERT_OK(env_->GetChildren(backup_logs, &old_files)); for (auto& file : old_files) { - if (file != "." && file != "..") { - env_->DeleteFile(backup_logs + "/" + file); - } + ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file)); } Options options = CurrentOptions(); options.create_if_missing = true; @@ -524,11 +693,9 @@ TEST_F(DBWALTest, IgnoreRecoveredLog) { // copy the logs to backup std::vector logs; - env_->GetChildren(options.wal_dir, &logs); + ASSERT_OK(env_->GetChildren(options.wal_dir, &logs)); for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); - } + CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log); } // recover the DB @@ -539,9 +706,7 @@ TEST_F(DBWALTest, IgnoreRecoveredLog) { // copy the logs from backup back to wal dir for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - } + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); } // this should ignore the log files, recovery should not happen again // if the recovery happens, the same merge operator would be called twice, @@ -555,11 +720,9 @@ TEST_F(DBWALTest, IgnoreRecoveredLog) { Close(); // copy the logs from backup back to wal dir - env_->CreateDirIfMissing(options.wal_dir); + ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir)); for (auto& log : logs) { - if (log != ".." && log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - } + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); } // assert that we successfully recovered only from logs, even though we // destroyed the DB @@ -570,16 +733,14 @@ TEST_F(DBWALTest, IgnoreRecoveredLog) { // Recovery will fail if DB directory doesn't exist. Destroy(options); // copy the logs from backup back to wal dir - env_->CreateDirIfMissing(options.wal_dir); + ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir)); for (auto& log : logs) { - if (log != ".." 
&& log != ".") { - CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); - // we won't be needing this file no more - env_->DeleteFile(backup_logs + "/" + log); - } + CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log); + // we won't be needing this file no more + ASSERT_OK(env_->DeleteFile(backup_logs + "/" + log)); } Status s = TryReopen(options); - ASSERT_TRUE(!s.ok()); + ASSERT_NOK(s); Destroy(options); } while (ChangeWalOptions()); } @@ -617,9 +778,9 @@ TEST_F(DBWALTest, PreallocateBlock) { called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -636,9 +797,9 @@ TEST_F(DBWALTest, PreallocateBlock) { called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -656,9 +817,9 @@ TEST_F(DBWALTest, PreallocateBlock) { called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -677,9 +838,9 @@ TEST_F(DBWALTest, PreallocateBlock) { called.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - Put("", ""); - Flush(); - Put("", ""); + ASSERT_OK(Put("", "")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("", "")); Close(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(2, called.load()); @@ -905,7 +1066,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) { // Make 'dobrynia' to be flushed and new WAL file to be created ASSERT_OK(Put(2, Key(10), DummyString(7500000))); ASSERT_OK(Put(2, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); { auto tables = ListTableFiles(env_, dbname_); ASSERT_EQ(tables.size(), static_cast(1)); @@ -959,7 +1120,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) { // Make 'nikitich' memtable to be flushed ASSERT_OK(Put(3, Key(10), DummyString(1002400))); ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3])); // 4 memtable are not flushed, 1 sst file { auto tables = ListTableFiles(env_, dbname_); @@ -979,7 +1140,7 @@ TEST_F(DBWALTest, RecoverCheckFileAmount) { ASSERT_OK(Put(3, Key(10), DummyString(1002400))); // make it flush ASSERT_OK(Put(3, Key(1), DummyString(1))); - dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3])); // There are still 4 memtable not flushed, and 2 sst tables ASSERT_OK(Put(0, Key(1), DummyString(1))); ASSERT_OK(Put(1, Key(1), DummyString(1))); @@ -1027,10 +1188,10 @@ TEST_F(DBWALTest, SyncMultipleLogs) { for (uint64_t b = 0; b < kNumBatches; b++) { batch.Clear(); for (int i = 0; i < kBatchSize; i++) { - batch.Put(Key(i), DummyString(128)); + ASSERT_OK(batch.Put(Key(i), DummyString(128))); } - dbfull()->Write(wo, &batch); + ASSERT_OK(dbfull()->Write(wo, &batch)); } 
ASSERT_OK(dbfull()->SyncWAL()); @@ -1058,7 +1219,7 @@ TEST_F(DBWALTest, PartOfWritesWithWALDisabled) { ASSERT_OK(Flush(0)); ASSERT_OK(Put(0, "key", "v5", wal_on)); // seq id 5 ASSERT_EQ("v5", Get(0, "key")); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->FlushWAL(false)); // Simulate a crash. fault_env->SetFilesystemActive(false); Close(); @@ -1094,30 +1255,31 @@ class RecoveryTestHelper { *count = 0; std::shared_ptr table_cache = NewLRUCache(50, 0); - EnvOptions env_options; + FileOptions file_options; WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); std::unique_ptr versions; std::unique_ptr wal_manager; WriteController write_controller; - versions.reset(new VersionSet( - test->dbname_, &db_options, env_options, table_cache.get(), - &write_buffer_manager, &write_controller, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)); + versions.reset(new VersionSet(test->dbname_, &db_options, file_options, + table_cache.get(), &write_buffer_manager, + &write_controller, + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ "")); wal_manager.reset( - new WalManager(db_options, env_options, /*io_tracer=*/nullptr)); + new WalManager(db_options, file_options, /*io_tracer=*/nullptr)); std::unique_ptr current_log_writer; for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) { uint64_t current_log_number = j; std::string fname = LogFileName(test->dbname_, current_log_number); - std::unique_ptr file; - ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), fname, env_options)); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(db_options.env->GetFileSystem(), + fname, file_options, &file_writer, + nullptr)); current_log_writer.reset( new log::Writer(std::move(file_writer), current_log_number, db_options.recycle_log_file_num > 0)); @@ -1126,12 +1288,13 @@ class RecoveryTestHelper { for (int i = 0; i < kKeysPerWALFile; i++) { std::string key = "key" + ToString((*count)++); std::string value = test->DummyString(kValueSize); - assert(current_log_writer.get() != nullptr); + ASSERT_NE(current_log_writer.get(), nullptr); uint64_t seq = versions->LastSequence() + 1; batch.Clear(); - batch.Put(key, value); + ASSERT_OK(batch.Put(key, value)); WriteBatchInternal::SetSequence(&batch, seq); - current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch)); + ASSERT_OK(current_log_writer->AddRecord( + WriteBatchInternal::Contents(&batch))); versions->SetLastAllocatedSequence(seq); versions->SetLastPublishedSequence(seq); versions->SetLastSequence(seq); @@ -1179,32 +1342,13 @@ class RecoveryTestHelper { test->Close(); #endif if (trunc) { - ASSERT_EQ(0, truncate(fname.c_str(), static_cast(size * off))); + ASSERT_OK( + test::TruncateFile(env, fname, static_cast(size * off))); } else { - InduceCorruption(fname, static_cast(size * off + 8), - static_cast(size * len)); + ASSERT_OK(test::CorruptFile(env, fname, static_cast(size * off + 8), + static_cast(size * len), false)); } } - - // Overwrite data with 'a' from offset for length len - static void InduceCorruption(const std::string& filename, size_t offset, - size_t len) { - ASSERT_GT(len, 0U); - - int fd = open(filename.c_str(), O_RDWR); - - // On windows long is 32-bit - ASSERT_LE(offset, std::numeric_limits::max()); - - ASSERT_GT(fd, 0); - ASSERT_EQ(offset, lseek(fd, static_cast(offset), SEEK_SET)); - - void* buf = alloca(len); - memset(buf, 'b', len); - 
ASSERT_EQ(len, write(fd, buf, static_cast(len))); - - close(fd); - } }; class DBWALTestWithParams @@ -1326,11 +1470,11 @@ TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) { ASSERT_OK(Put(1, "key3", "val3")); // Corrupt WAL at location of key3 - RecoveryTestHelper::InduceCorruption( - fname, static_cast(offset_to_corrupt), static_cast(4)); + ASSERT_OK(test::CorruptFile(env, fname, static_cast(offset_to_corrupt), + 4, false)); ASSERT_OK(Put(2, "key4", "val4")); ASSERT_OK(Put(1, "key5", "val5")); - Flush(2); + ASSERT_OK(Flush(2)); // PIT recovery & verify options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; @@ -1484,7 +1628,7 @@ TEST_F(DBWALTest, WalCleanupAfterAvoidFlushDuringRecovery) { for (int i = 0; i < 2; ++i) { if (i > 0) { // Flush() triggers deletion of obsolete tracked files - Flush(); + ASSERT_OK(Flush()); } VectorLogPtr log_files; ASSERT_OK(dbfull()->GetSortedWalFiles(log_files)); @@ -1526,7 +1670,7 @@ TEST_F(DBWALTest, RecoverWithoutFlush) { ASSERT_EQ(Get("foo"), "foo_v2"); ASSERT_EQ(Get("bar"), "bar_v2"); // manual flush and insert again - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ(Get("foo"), "foo_v2"); ASSERT_EQ(Get("bar"), "bar_v2"); ASSERT_OK(Put("foo", "foo_v3")); @@ -1547,7 +1691,9 @@ TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) { auto countWalFiles = [this]() { VectorLogPtr log_files; - dbfull()->GetSortedWalFiles(log_files); + if (!dbfull()->GetSortedWalFiles(log_files).ok()) { + return size_t{0}; + } return log_files.size(); }; @@ -1555,11 +1701,11 @@ TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) { CreateAndReopenWithCF({"one", "two"}, options); ASSERT_OK(Put(0, "key1", kSmallValue)); ASSERT_OK(Put(1, "key2", kLargeValue)); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_EQ(1, countWalFiles()); ASSERT_OK(Put(0, "key3", kSmallValue)); ASSERT_OK(Put(2, "key4", kLargeValue)); - Flush(2); + ASSERT_OK(Flush(2)); ASSERT_EQ(2, countWalFiles()); // Reopen, insert and flush. @@ -1573,9 +1719,9 @@ TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) { ASSERT_OK(Put(0, "key5", kLargeValue)); ASSERT_OK(Put(1, "key6", kLargeValue)); ASSERT_EQ(3, countWalFiles()); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(Put(2, "key7", kLargeValue)); - dbfull()->FlushWAL(false); + ASSERT_OK(dbfull()->FlushWAL(false)); ASSERT_EQ(4, countWalFiles()); // Reopen twice and validate. @@ -1656,19 +1802,8 @@ TEST_P(DBWALTestWithParamsVaryingRecoveryMode, // avoid_flush_during_recovery=true. // Flush should trigger if max_total_wal_size is reached. TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) { - class TestFlushListener : public EventListener { - public: - std::atomic count{0}; - - TestFlushListener() = default; - - void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override { - count++; - assert(FlushReason::kWriteBufferManager == flush_job_info.flush_reason); - } - }; - std::shared_ptr test_listener = - std::make_shared(); + auto test_listener = std::make_shared(); + test_listener->expected_flush_reason = FlushReason::kWalFull; constexpr size_t kKB = 1024; constexpr size_t kMB = 1024 * 1024; @@ -1708,7 +1843,9 @@ TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) { 1 * kMB); // Write one more key to trigger flush. ASSERT_OK(Put(0, "foo", "v2")); - dbfull()->TEST_WaitForFlushMemTable(); + for (auto* h : handles_) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(h)); + } // Flushed two column families. 
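The listener rewrite above swaps the local `TestFlushListener` for a shared counting-listener helper that expects `FlushReason::kWalFull`. A minimal listener of that shape (the class name `WalFullFlushCounter` is hypothetical) is:

#include <atomic>

#include "rocksdb/listener.h"

class WalFullFlushCounter : public rocksdb::EventListener {
 public:
  void OnFlushBegin(rocksdb::DB* /*db*/,
                    const rocksdb::FlushJobInfo& info) override {
    // Count only the flushes triggered because the WAL size limit was hit.
    if (info.flush_reason == rocksdb::FlushReason::kWalFull) {
      count.fetch_add(1);
    }
  }
  std::atomic<int> count{0};
};
// Installed via options.listeners.push_back(std::make_shared<WalFullFlushCounter>()).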
ASSERT_EQ(2, test_listener->count.load()); } @@ -1720,24 +1857,15 @@ TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) { TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) { constexpr size_t kKB = 1024; Options options = CurrentOptions(); + options.env = env_; options.avoid_flush_during_recovery = true; - // Test fallocate support of running file system. - // Skip this test if fallocate is not supported. - std::string fname_test_fallocate = dbname_ + "/preallocate_testfile"; - int fd = -1; - do { - fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); - } while (fd < 0 && errno == EINTR); - ASSERT_GT(fd, 0); - int alloc_status = fallocate(fd, 0, 0, 1); - int err_number = errno; - close(fd); - ASSERT_OK(options.env->DeleteFile(fname_test_fallocate)); - if (err_number == ENOSYS || err_number == EOPNOTSUPP) { - fprintf(stderr, "Skipped preallocated space check: %s\n", strerror(err_number)); + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { return; } - ASSERT_EQ(0, alloc_status); DestroyAndReopen(options); size_t preallocated_size = @@ -1760,6 +1888,175 @@ TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) { ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), preallocated_size); } +// Tests that we will truncate the preallocated space of the last log from +// previous. +TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithFlush) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = false; + options.avoid_flush_during_shutdown = true; + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + // The log file has preallocated space. + Close(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::PurgeObsoleteFiles:Begin", + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"}, + {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate", + "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + port::Thread reopen_thread([&]() { Reopen(options); }); + + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"); + // After the flush during Open, the log file should get deleted. However, + // if the process is in a crash loop, the log file may not get + // deleted and thte preallocated space will keep accumulating. So we need + // to ensure it gets trtuncated. 
+ EXPECT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate"); + reopen_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) { + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = false; + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem/non-encrypted environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + Close(); + std::vector filenames; + std::string last_log; + uint64_t last_log_num = 0; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + for (auto fname : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(fname, &number, &type, nullptr)) { + if (type == kWalFile && number > last_log_num) { + last_log = fname; + } + } + } + ASSERT_NE(last_log, ""); + last_log = dbname_ + '/' + last_log; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::PurgeObsoleteFiles:Begin", + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"}, + {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate", + "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PosixWritableFile::Close", + [](void* arg) { *(reinterpret_cast(arg)) = 0; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // Preallocate space for the empty log file. This could happen if WAL data + // was buffered in memory and the process crashed. + std::unique_ptr log_file; + ASSERT_OK(env_->ReopenWritableFile(last_log, &log_file, EnvOptions())); + log_file->SetPreallocationBlockSize(preallocated_size); + log_file->PrepareWrite(0, 4096); + log_file.reset(); + + ASSERT_GE(GetAllocatedFileSize(last_log), preallocated_size); + + port::Thread reopen_thread([&]() { Reopen(options); }); + + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"); + // The preallocated space should be truncated. 
+ EXPECT_LT(GetAllocatedFileSize(last_log), preallocated_size); + TEST_SYNC_POINT( + "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate"); + reopen_thread.join(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBWALTest, ReadOnlyRecoveryNoTruncate) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.env = env_; + options.avoid_flush_during_recovery = true; + if (mem_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem environment"); + return; + } + if (!IsFallocateSupported()) { + return; + } + + // create DB and close with file truncate disabled + std::atomic_bool enable_truncate{false}; + + SyncPoint::GetInstance()->SetCallBack( + "PosixWritableFile::Close", [&](void* arg) { + if (!enable_truncate) { + *(reinterpret_cast(arg)) = 0; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + // The log file has preallocated space. + auto db_size = GetAllocatedFileSize(dbname_ + file_before->PathName()); + ASSERT_GE(db_size, preallocated_size); + Close(); + + // enable truncate and open DB as readonly, the file should not be truncated + // and DB size is not changed. + enable_truncate = true; + ASSERT_OK(ReadOnlyReopen(options)); + VectorLogPtr log_files_after; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after)); + ASSERT_EQ(1, log_files_after.size()); + ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB); + ASSERT_EQ(log_files_after[0]->PathName(), file_before->PathName()); + // The preallocated space should NOT be truncated. + // the DB size is almost the same. 
+ ASSERT_NEAR(GetAllocatedFileSize(dbname_ + file_before->PathName()), db_size, + db_size / 100); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} #endif // ROCKSDB_FALLOCATE_PRESENT #endif // ROCKSDB_PLATFORM_POSIX @@ -1777,9 +2074,9 @@ TEST_F(DBWALTest, WalTermTest) { wo.disableWAL = false; WriteBatch batch; - batch.Put("foo", "bar"); + ASSERT_OK(batch.Put("foo", "bar")); batch.MarkWalTerminationPoint(); - batch.Put("foo2", "bar2"); + ASSERT_OK(batch.Put("foo2", "bar2")); ASSERT_OK(dbfull()->Write(wo, &batch)); diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc index a0984bc2ff7..a227eb9395d 100644 --- a/db/db_with_timestamp_basic_test.cc +++ b/db/db_with_timestamp_basic_test.cc @@ -32,6 +32,22 @@ class DBBasicTestWithTimestampBase : public DBTestBase { return ret; } + static std::string KeyWithPrefix(std::string prefix, uint64_t k) { + std::string ret; + PutFixed64(&ret, k); + std::reverse(ret.begin(), ret.end()); + return prefix + ret; + } + + static std::vector ConvertStrToSlice( + std::vector& strings) { + std::vector ret; + for (const auto& s : strings) { + ret.emplace_back(s); + } + return ret; + } + class TestComparator : public Comparator { private: const Comparator* cmp_without_ts_; @@ -141,7 +157,8 @@ class DBBasicTestWithTimestampBase : public DBTestBase { ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size()); ukey_and_ts.append(expected_ts.data(), expected_ts.size()); ParsedInternalKey parsed_ikey; - ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey)); + ASSERT_OK( + ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */)); ASSERT_EQ(ukey_and_ts, parsed_ikey.user_key); ASSERT_EQ(expected_val_type, parsed_ikey.type); ASSERT_EQ(expected_seq, parsed_ikey.sequence); @@ -161,7 +178,8 @@ class DBBasicTestWithTimestampBase : public DBTestBase { ukey_and_ts.append(expected_ts.data(), expected_ts.size()); ParsedInternalKey parsed_ikey; - ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey)); + ASSERT_OK( + ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */)); ASSERT_EQ(expected_val_type, parsed_ikey.type); ASSERT_EQ(Slice(ukey_and_ts), parsed_ikey.user_key); if (expected_val_type == kTypeValue) { @@ -177,7 +195,218 @@ class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase { : DBBasicTestWithTimestampBase("db_basic_test_with_timestamp") {} }; -TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) { +TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar")); + ASSERT_OK(Flush()); + + std::string start_str = "foo"; + std::string end_str = "foo2"; + Slice start(start_str), end(end_str); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLow) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; 
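The timestamp tests in this file all share the same plumbing: a comparator that declares a timestamp size, plus `WriteOptions::timestamp` and `ReadOptions::timestamp`. A minimal fragment of that plumbing (the function name is hypothetical; it assumes the DB was opened with an 8-byte-timestamp comparator that orders larger encodings as newer, as the `TestComparator` here does):

#include <string>

#include "rocksdb/db.h"

rocksdb::Status PutAndGetAtTimestamp(rocksdb::DB* db) {
  std::string write_ts(8, '\0');  // encoded timestamp; size must match the comparator
  rocksdb::Slice write_ts_slice(write_ts);
  rocksdb::WriteOptions write_opts;
  write_opts.timestamp = &write_ts_slice;
  rocksdb::Status s = db->Put(write_opts, "foo1", "bar");
  if (!s.ok()) {
    return s;
  }

  std::string read_ts(8, '\xff');  // read as of the newest possible timestamp
  rocksdb::Slice read_ts_slice(read_ts);
  rocksdb::ReadOptions read_opts;
  read_opts.timestamp = &read_ts_slice;
  std::string value;
  return db->Get(read_opts, "foo1", &value);
}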
+ DestroyAndReopen(options); + + const std::string kKey = "test kKey"; + + // Test set ts_low first and flush() + int current_ts_low = 5; + std::string ts_low_str = Timestamp(current_ts_low, 0); + Slice ts_low = ts_low_str; + CompactRangeOptions comp_opts; + comp_opts.full_history_ts_low = &ts_low; + comp_opts.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr)); + + auto* cfd = + static_cast_with_check(db_->DefaultColumnFamily()) + ->cfd(); + auto result_ts_low = cfd->GetFullHistoryTsLow(); + + ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0); + + for (int i = 0; i < 10; i++) { + WriteOptions write_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, kKey, Key(i))); + } + ASSERT_OK(Flush()); + + for (int i = 0; i < 10; i++) { + ReadOptions read_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::string value; + Status status = db_->Get(read_opts, kKey, &value); + if (i < current_ts_low) { + ASSERT_TRUE(status.IsNotFound()); + } else { + ASSERT_OK(status); + ASSERT_TRUE(value.compare(Key(i)) == 0); + } + } + + // Test set ts_low and then trigger compaction + for (int i = 10; i < 20; i++) { + WriteOptions write_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, kKey, Key(i))); + } + + ASSERT_OK(Flush()); + + current_ts_low = 15; + ts_low_str = Timestamp(current_ts_low, 0); + ts_low = ts_low_str; + comp_opts.full_history_ts_low = &ts_low; + ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr)); + result_ts_low = cfd->GetFullHistoryTsLow(); + ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0); + + for (int i = 0; i < 20; i++) { + ReadOptions read_opts; + std::string ts_str = Timestamp(i, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::string value; + Status status = db_->Get(read_opts, kKey, &value); + if (i < current_ts_low) { + ASSERT_TRUE(status.IsNotFound()); + } else { + ASSERT_OK(status); + ASSERT_TRUE(value.compare(Key(i)) == 0); + } + } + + // Test invalid compaction with range + Slice start(kKey), end(kKey); + Status s = db_->CompactRange(comp_opts, &start, &end); + ASSERT_TRUE(s.IsInvalidArgument()); + s = db_->CompactRange(comp_opts, &start, nullptr); + ASSERT_TRUE(s.IsInvalidArgument()); + s = db_->CompactRange(comp_opts, nullptr, &end); + ASSERT_TRUE(s.IsInvalidArgument()); + + // Test invalid compaction with the decreasing ts_low + ts_low_str = Timestamp(current_ts_low - 1, 0); + ts_low = ts_low_str; + comp_opts.full_history_ts_low = &ts_low; + s = db_->CompactRange(comp_opts, nullptr, nullptr); + ASSERT_TRUE(s.IsInvalidArgument()); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, GetApproximateSizes) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000000; // Large write buffer + options.compression = kNoCompression; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + auto default_cf = db_->DefaultColumnFamily(); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + const int N = 128; + Random rnd(301); + for (int i = 0; i < N; i++) { + ASSERT_OK(db_->Put(write_opts, Key(i), rnd.RandomString(1024))); + } + + 
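The queries that follow use the `SizeApproximationOptions` overload of `GetApproximateSizes`; in isolation it looks like this (the function name and key bounds are placeholders; `include_memtabtles` is the actual field spelling in the public header):

#include <cassert>
#include <cstdint>

#include "rocksdb/db.h"

void ApproximateRangeSize(rocksdb::DB* db) {
  rocksdb::SizeApproximationOptions size_opts;
  size_opts.include_memtabtles = true;  // count data still in memtables
  size_opts.include_files = true;       // count data in SST files
  rocksdb::Range r("key50", "key60");   // [start, limit)
  uint64_t size = 0;
  rocksdb::Status s = db->GetApproximateSizes(
      size_opts, db->DefaultColumnFamily(), &r, 1, &size);
  assert(s.ok());
  (void)size;  // approximate bytes covered by the range
}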
uint64_t size; + std::string start = Key(50); + std::string end = Key(60); + Range r(start, end); + SizeApproximationOptions size_approx_options; + size_approx_options.include_memtabtles = true; + size_approx_options.include_files = true; + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_GT(size, 6000); + ASSERT_LT(size, 204800); + + // test multiple ranges + std::vector ranges; + std::string start_tmp = Key(10); + std::string end_tmp = Key(20); + ranges.emplace_back(Range(start_tmp, end_tmp)); + ranges.emplace_back(Range(start, end)); + uint64_t range_sizes[2]; + ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, + ranges.data(), 2, range_sizes)); + + ASSERT_EQ(range_sizes[1], size); + + // Zero if not including mem table + ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size)); + ASSERT_EQ(size, 0); + + start = Key(500); + end = Key(600); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, 0); + + // Test range boundaries + ASSERT_OK(db_->Put(write_opts, Key(1000), rnd.RandomString(1024))); + // Should include start key + start = Key(1000); + end = Key(1100); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_GT(size, 0); + + // Should exclude end key + start = Key(900); + end = Key(1000); + r = Range(start, end); + ASSERT_OK( + db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); + ASSERT_EQ(size, 0); + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, SimpleIterate) { const int kNumKeysPerFile = 128; const uint64_t kMaxKey = 1024; Options options = CurrentOptions(); @@ -209,6 +438,7 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) { std::unique_ptr it(db_->NewIterator(read_opts)); int count = 0; uint64_t key = 0; + // Forward iterate. for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid(); it->Next(), ++count, ++key) { CheckIterUserEntry(it.get(), Key1(key), kTypeValue, @@ -217,7 +447,16 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) { size_t expected_count = kMaxKey - start_keys[i] + 1; ASSERT_EQ(expected_count, count); - // SeekToFirst() with lower bound. + // Backward iterate. + count = 0; + for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid(); + it->Prev(), ++count, --key) { + CheckIterUserEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } + ASSERT_EQ(static_cast(kMaxKey) - start_keys[i] + 1, count); + + // SeekToFirst()/SeekToLast() with lower/upper bounds. // Then iter with lower and upper bounds. 
uint64_t l = 0; uint64_t r = kMaxKey + 1; @@ -235,6 +474,12 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) { "value" + std::to_string(i), write_timestamps[i]); } ASSERT_EQ(r - std::max(l, start_keys[i]), count); + + for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0; + it->Valid(); it->Prev(), --key, ++count) { + CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + } l += (kMaxKey / 100); r -= (kMaxKey / 100); } @@ -242,256 +487,917 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterate) { Close(); } -TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) { - constexpr int kNumKeysPerFile = 128; - constexpr uint64_t kMaxKey = 1024; - Options options = CurrentOptions(); - options.env = env_; +class DBBasicTestWithTimestampTableOptions + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface { + public: + explicit DBBasicTestWithTimestampTableOptions() + : DBBasicTestWithTimestampBase( + "db_basic_test_with_timestamp_table_options") {} +}; + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampTableOptions, + testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)); + +TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) { + Options options = GetDefaultOptions(); options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.compression = kNoCompression; + BlockBasedTableOptions bbto; + bbto.index_type = GetParam(); + bbto.block_size = 100; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); const size_t kTimestampSize = Timestamp(0, 0).size(); - TestComparator test_cmp(kTimestampSize); - options.comparator = &test_cmp; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + TestComparator cmp(kTimestampSize); + options.comparator = &cmp; DestroyAndReopen(options); - const std::vector write_timestamps = {Timestamp(1, 0), - Timestamp(3, 0)}; - const std::vector read_timestamps = {Timestamp(2, 0), - Timestamp(4, 0)}; - const std::vector read_timestamps_lb = {Timestamp(1, 0), - Timestamp(1, 0)}; - for (size_t i = 0; i < write_timestamps.size(); ++i) { + constexpr uint64_t kNumKeys = 1024; + for (uint64_t k = 0; k < kNumKeys; ++k) { WriteOptions write_opts; - Slice write_ts = write_timestamps[i]; - write_opts.timestamp = &write_ts; - for (uint64_t key = 0; key <= kMaxKey; ++key) { - Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(i)); - ASSERT_OK(s); - } + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, Key1(k), "value" + std::to_string(k))); } - for (size_t i = 0; i < read_timestamps.size(); ++i) { + ASSERT_OK(Flush()); + { ReadOptions read_opts; - Slice read_ts = read_timestamps[i]; - Slice read_ts_lb = read_timestamps_lb[i]; - read_opts.timestamp = &read_ts; - read_opts.iter_start_ts = &read_ts_lb; + read_opts.total_order_seek = true; + std::string ts_str = Timestamp(2, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; std::unique_ptr it(db_->NewIterator(read_opts)); - int count = 0; - uint64_t key = 0; - for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { - CheckIterEntry(it.get(), Key1(key), kTypeValue, - "value" + std::to_string(i), write_timestamps[i]); - if (i > 0) { - 
it->Next(); - CheckIterEntry(it.get(), Key1(key), kTypeValue, - "value" + std::to_string(i - 1), - write_timestamps[i - 1]); + // verify Get() + for (it->SeekToFirst(); it->Valid(); it->Next()) { + std::string value_from_get; + std::string key_str(it->key().data(), it->key().size()); + std::string timestamp; + ASSERT_OK(db_->Get(read_opts, key_str, &value_from_get, ×tamp)); + ASSERT_EQ(it->value(), value_from_get); + ASSERT_EQ(Timestamp(1, 0), timestamp); + } + + // verify MultiGet() + constexpr uint64_t step = 2; + static_assert(0 == (kNumKeys % step), + "kNumKeys must be a multiple of step"); + for (uint64_t k = 0; k < kNumKeys; k += 2) { + std::vector key_strs; + std::vector keys; + for (size_t i = 0; i < step; ++i) { + key_strs.push_back(Key1(k + i)); + } + for (size_t i = 0; i < step; ++i) { + keys.emplace_back(key_strs[i]); + } + std::vector values; + std::vector timestamps; + std::vector statuses = + db_->MultiGet(read_opts, keys, &values, ×tamps); + ASSERT_EQ(step, statuses.size()); + ASSERT_EQ(step, values.size()); + ASSERT_EQ(step, timestamps.size()); + for (uint64_t i = 0; i < step; ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ("value" + std::to_string(k + i), values[i]); + ASSERT_EQ(Timestamp(1, 0), timestamps[i]); } } - size_t expected_count = kMaxKey + 1; - ASSERT_EQ(expected_count, count); } - // Delete all keys@ts=5 and check iteration result with start ts set + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(3)); + options.memtable_whole_key_filtering = true; + options.memtable_prefix_bloom_size_ratio = 0.1; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_OK(db_->Put(write_opts, "foo3", "bar")); + ASSERT_OK(Flush()); + + ReadOptions read_opts; + std::string read_ts = Timestamp(1, 0); + ts = read_ts; + read_opts.timestamp = &ts; { - std::string write_timestamp = Timestamp(5, 0); - WriteOptions write_opts; - Slice write_ts = write_timestamp; - write_opts.timestamp = &write_ts; - for (uint64_t key = 0; key < kMaxKey + 1; ++key) { - Status s = db_->Delete(write_opts, Key1(key)); - ASSERT_OK(s); - } + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); - std::string read_timestamp = Timestamp(6, 0); - ReadOptions read_opts; - Slice read_ts = read_timestamp; - read_opts.timestamp = &read_ts; - std::string read_timestamp_lb = Timestamp(2, 0); - Slice read_ts_lb = read_timestamp_lb; - read_opts.iter_start_ts = &read_ts_lb; - std::unique_ptr it(db_->NewIterator(read_opts)); - int count = 
0; - uint64_t key = 0; - for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { - CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(), - write_ts); - // Skip key@ts=3 and land on tombstone key@ts=5 - it->Next(); - } - ASSERT_EQ(kMaxKey + 1, count); + iter->Seek("bbb"); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLongerThanKey) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(20)); + options.memtable_whole_key_filtering = true; + options.memtable_prefix_bloom_size_ratio = 0.1; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_OK(db_->Put(write_opts, "foo3", "bar")); + ASSERT_OK(Flush()); + + ReadOptions read_opts; + std::string read_ts = Timestamp(2, 0); + ts = read_ts; + read_opts.timestamp = &ts; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + // Make sure the prefix extractor doesn't include timestamp, otherwise it + // may return invalid result. 
+ iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); } + + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithBound) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(2)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo1", "bar1")); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(write_opts, "foo2", "bar2")); + ASSERT_OK(Flush()); + + // Move sst file to next level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + for (int i = 3; i < 9; ++i) { + ASSERT_OK(db_->Put(write_opts, "foo" + std::to_string(i), + "bar" + std::to_string(i))); + } + ASSERT_OK(Flush()); + + ReadOptions read_opts; + std::string read_ts = Timestamp(2, 0); + ts = read_ts; + read_opts.timestamp = &ts; + std::string up_bound = "foo5"; // exclusive + Slice up_bound_slice = up_bound; + std::string lo_bound = "foo2"; // inclusive + Slice lo_bound_slice = lo_bound; + read_opts.iterate_upper_bound = &up_bound_slice; + read_opts.iterate_lower_bound = &lo_bound_slice; + read_opts.auto_prefix_mode = true; + { + std::unique_ptr iter(db_->NewIterator(read_opts)); + // Make sure the prefix extractor doesn't include timestamp, otherwise it + // may return invalid result. 
+ iter->Seek("foo"); + CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2", + Timestamp(1, 0)); + iter->SeekToFirst(); + CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2", + Timestamp(1, 0)); + iter->SeekForPrev("g"); + CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0)); + iter->SeekToLast(); + CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0)); + } + + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env_; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + DestroyAndReopen(options); + const std::vector timestamps = {Timestamp(1, 1), Timestamp(0, 2), + Timestamp(4, 3)}; + const std::vector> kvs = { + std::make_tuple("aa", "value1"), std::make_tuple("ab", "value2")}; + for (const auto& ts : timestamps) { + WriteBatch wb(0, 0, kTimestampSize); + for (const auto& kv : kvs) { + const std::string& key = std::get<0>(kv); + const std::string& value = std::get<1>(kv); + ASSERT_OK(wb.Put(key, value)); + } + + ASSERT_OK(wb.AssignTimestamp(ts)); + ASSERT_OK(db_->Write(WriteOptions(), &wb)); + } + std::string read_ts_str = Timestamp(5, 3); + Slice read_ts = read_ts_str; + ReadOptions read_opts; + read_opts.timestamp = &read_ts; + std::unique_ptr it(db_->NewIterator(read_opts)); + + it->SeekToFirst(); + ASSERT_TRUE(it->Valid()); + it->Prev(); + ASSERT_FALSE(it->Valid()); + + it->SeekToLast(); + ASSERT_TRUE(it->Valid()); + uint64_t prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + ASSERT_EQ(0, prev_reseek_count); + it->Next(); + ASSERT_FALSE(it->Valid()); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it->Seek(std::get<0>(kvs[0])); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + it->Prev(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + + prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + ASSERT_EQ(1, prev_reseek_count); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it->SeekForPrev(std::get<0>(kvs[1])); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + it->Prev(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue, + std::get<1>(kvs[0]), Timestamp(4, 3)); + + prev_reseek_count = + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION); + it->Next(); + CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue, + std::get<1>(kvs[1]), Timestamp(4, 3)); + ASSERT_EQ(1 + prev_reseek_count, + options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + it.reset(); + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) { + constexpr int kNumKeysPerFile = 128; + constexpr uint64_t kMaxKey = 1024; + Options options = CurrentOptions(); + 
options.env = env_; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + const std::vector write_timestamps = {Timestamp(1, 0), + Timestamp(3, 0)}; + const std::vector read_timestamps = {Timestamp(2, 0), + Timestamp(4, 0)}; + const std::vector read_timestamps_lb = {Timestamp(1, 0), + Timestamp(1, 0)}; + for (size_t i = 0; i < write_timestamps.size(); ++i) { + WriteOptions write_opts; + Slice write_ts = write_timestamps[i]; + write_opts.timestamp = &write_ts; + for (uint64_t key = 0; key <= kMaxKey; ++key) { + Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(i)); + ASSERT_OK(s); + } + } + for (size_t i = 0; i < read_timestamps.size(); ++i) { + ReadOptions read_opts; + Slice read_ts = read_timestamps[i]; + Slice read_ts_lb = read_timestamps_lb[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_ts = &read_ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { + CheckIterEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i), write_timestamps[i]); + if (i > 0) { + it->Next(); + CheckIterEntry(it.get(), Key1(key), kTypeValue, + "value" + std::to_string(i - 1), + write_timestamps[i - 1]); + } + } + size_t expected_count = kMaxKey + 1; + ASSERT_EQ(expected_count, count); + } + // Delete all keys@ts=5 and check iteration result with start ts set + { + std::string write_timestamp = Timestamp(5, 0); + WriteOptions write_opts; + Slice write_ts = write_timestamp; + write_opts.timestamp = &write_ts; + for (uint64_t key = 0; key < kMaxKey + 1; ++key) { + Status s = db_->Delete(write_opts, Key1(key)); + ASSERT_OK(s); + } + + std::string read_timestamp = Timestamp(6, 0); + ReadOptions read_opts; + Slice read_ts = read_timestamp; + read_opts.timestamp = &read_ts; + std::string read_timestamp_lb = Timestamp(2, 0); + Slice read_ts_lb = read_timestamp_lb; + read_opts.iter_start_ts = &read_ts_lb; + std::unique_ptr it(db_->NewIterator(read_opts)); + int count = 0; + uint64_t key = 0; + for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) { + CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(), + write_ts); + // Skip key@ts=3 and land on tombstone key@ts=5 + it->Next(); + } + ASSERT_EQ(kMaxKey + 1, count); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ForwardIterateStartSeqnum) { + const int kNumKeysPerFile = 128; + const uint64_t kMaxKey = 0xffffffffffffffff; + const uint64_t kMinKey = kMaxKey - 1023; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + // Need to disable compaction to bottommost level when sequence number will be + // zeroed out, causing the verification of sequence number to fail in this + // test. 
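`SimpleForwardIterateLowerTsBound` above combines `ReadOptions::timestamp` with `ReadOptions::iter_start_ts`, so the iterator surfaces every version of a key (including timestamped tombstones) that is no newer than `timestamp` and no older than `iter_start_ts`, newest first per key. A minimal fragment of that read path (the function name is hypothetical):

#include <memory>
#include <string>

#include "rocksdb/db.h"

void ScanTimestampRange(rocksdb::DB* db, const std::string& read_ts,
                        const std::string& start_ts) {
  rocksdb::Slice read_ts_slice(read_ts);
  rocksdb::Slice start_ts_slice(start_ts);
  rocksdb::ReadOptions read_opts;
  read_opts.timestamp = &read_ts_slice;       // newest visible timestamp
  read_opts.iter_start_ts = &start_ts_slice;  // oldest visible timestamp
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_opts));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // Each visible version of each key is surfaced here; consume it->key()
    // and it->value() as needed.
  }
}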
+ options.disable_auto_compactions = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + std::vector start_seqs; + + const int kNumTimestamps = 4; + std::vector write_ts_list; + for (int t = 0; t != kNumTimestamps; ++t) { + write_ts_list.push_back(Timestamp(2 * t, /*do not care*/ 17)); + } + WriteOptions write_opts; + for (size_t i = 0; i != write_ts_list.size(); ++i) { + Slice write_ts = write_ts_list[i]; + write_opts.timestamp = &write_ts; + for (uint64_t k = kMaxKey; k >= kMinKey; --k) { + Status s; + if (k % 2) { + s = db_->Put(write_opts, Key1(k), "value" + std::to_string(i)); + } else { + s = db_->Delete(write_opts, Key1(k)); + } + ASSERT_OK(s); + } + start_seqs.push_back(db_->GetLatestSequenceNumber()); + } + std::vector read_ts_list; + for (int t = 0; t != kNumTimestamps - 1; ++t) { + read_ts_list.push_back(Timestamp(2 * t + 3, /*do not care*/ 17)); + } + + ReadOptions read_opts; + // Scan with only read_opts.iter_start_seqnum set. + for (size_t i = 0; i != read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_seqnum = start_seqs[i] + 1; + std::unique_ptr iter(db_->NewIterator(read_opts)); + SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1; + uint64_t key = kMinKey; + for (iter->Seek(Key1(kMinKey)); iter->Valid(); iter->Next()) { + CheckIterEntry( + iter.get(), Key1(key), expected_seq, + (key % 2) ? kTypeValue : kTypeDeletionWithTimestamp, + (key % 2) ? "value" + std::to_string(i + 1) : std::string(), + write_ts_list[i + 1]); + ++key; + --expected_seq; + } + } + // Scan with both read_opts.iter_start_seqnum and read_opts.iter_start_ts set. + std::vector read_ts_lb_list; + for (int t = 0; t < kNumTimestamps - 1; ++t) { + read_ts_lb_list.push_back(Timestamp(2 * t, /*do not care*/ 17)); + } + for (size_t i = 0; i < read_ts_list.size(); ++i) { + Slice read_ts = read_ts_list[i]; + Slice read_ts_lb = read_ts_lb_list[i]; + read_opts.timestamp = &read_ts; + read_opts.iter_start_ts = &read_ts_lb; + read_opts.iter_start_seqnum = start_seqs[i] + 1; + std::unique_ptr it(db_->NewIterator(read_opts)); + uint64_t key = kMinKey; + SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1; + for (it->Seek(Key1(kMinKey)); it->Valid(); it->Next()) { + CheckIterEntry(it.get(), Key1(key), expected_seq, + (key % 2) ? 
kTypeValue : kTypeDeletionWithTimestamp, + "value" + std::to_string(i + 1), write_ts_list[i + 1]); + ++key; + --expected_seq; + } + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToTargetTimestamp) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Insert kNumKeys + WriteOptions write_opts; + Status s; + for (size_t i = 0; i != kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "foo", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToFirst(); + CheckIterUserEntry(iter.get(), "foo", kTypeValue, "value0", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + + ts_str = Timestamp(kNumKeys, 0); + ts = ts_str; + read_opts.timestamp = &ts; + iter.reset(db_->NewIterator(read_opts)); + iter->SeekToLast(); + CheckIterUserEntry(iter.get(), "foo", kTypeValue, + "value" + std::to_string(kNumKeys - 1), ts_str); + ASSERT_EQ( + 2, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + // Write kNumKeys + 1 keys + WriteOptions write_opts; + Status s; + for (size_t i = 0; i != kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "a", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); + WriteBatch batch(0, 0, kTimestampSize); + ASSERT_OK(batch.Put("a", "new_value")); + ASSERT_OK(batch.Put("b", "new_value")); + s = batch.AssignTimestamp(ts_str); + ASSERT_OK(s); + s = db_->Write(write_opts, &batch); + ASSERT_OK(s); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->Seek("a"); + iter->Next(); + CheckIterUserEntry(iter.get(), "b", kTypeValue, "new_value", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, ReseekToUserKeyBeforeSavedKey) { + Options options = GetDefaultOptions(); + options.env = env_; + options.create_if_missing = true; + constexpr size_t kNumKeys = 16; + options.max_sequential_skip_in_iterations = kNumKeys / 2; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + 
options.comparator = &test_cmp; + DestroyAndReopen(options); + for (size_t i = 0; i < kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + WriteOptions write_opts; + write_opts.timestamp = &ts; + Status s = db_->Put(write_opts, "b", "value" + std::to_string(i)); + ASSERT_OK(s); + } + { + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + WriteOptions write_opts; + write_opts.timestamp = &ts; + ASSERT_OK(db_->Put(write_opts, "a", "value")); + } + { + ReadOptions read_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + read_opts.timestamp = &ts; + std::unique_ptr iter(db_->NewIterator(read_opts)); + iter->SeekToLast(); + iter->Prev(); + CheckIterUserEntry(iter.get(), "a", kTypeValue, "value", ts_str); + ASSERT_EQ( + 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + } + Close(); +} + +TEST_F(DBBasicTestWithTimestamp, MultiGetWithFastLocalBloom) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = true; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithPrefix) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(5)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); + Close(); +} + +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithMemBloomFilter) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + 
options.prefix_extractor.reset(NewCappedPrefixTransform(5)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + // Read with MultiGet + ts_str = Timestamp(2, 0); + ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); Close(); } -TEST_F(DBBasicTestWithTimestamp, ForwardIterateStartSeqnum) { - const int kNumKeysPerFile = 128; - const uint64_t kMaxKey = 0xffffffffffffffff; - const uint64_t kMinKey = kMaxKey - 1023; +TEST_F(DBBasicTestWithTimestamp, MultiGetRangeFiltering) { Options options = CurrentOptions(); options.env = env_; options.create_if_missing = true; - // Need to disable compaction to bottommost level when sequence number will be - // zeroed out, causing the verification of sequence number to fail in this - // test. - options.disable_auto_compactions = true; + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); const size_t kTimestampSize = Timestamp(0, 0).size(); TestComparator test_cmp(kTimestampSize); options.comparator = &test_cmp; - options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); DestroyAndReopen(options); - std::vector start_seqs; - const int kNumTimestamps = 4; - std::vector write_ts_list; - for (int t = 0; t != kNumTimestamps; ++t) { - write_ts_list.push_back(Timestamp(2 * t, /*do not care*/ 17)); - } + // Write any value WriteOptions write_opts; - for (size_t i = 0; i != write_ts_list.size(); ++i) { - Slice write_ts = write_ts_list[i]; - write_opts.timestamp = &write_ts; - for (uint64_t k = kMaxKey; k >= kMinKey; --k) { - Status s; - if (k % 2) { - s = db_->Put(write_opts, Key1(k), "value" + std::to_string(i)); - } else { - s = db_->Delete(write_opts, Key1(k)); - } - ASSERT_OK(s); - } - start_seqs.push_back(db_->GetLatestSequenceNumber()); - } - std::vector read_ts_list; - for (int t = 0; t != kNumTimestamps - 1; ++t) { - read_ts_list.push_back(Timestamp(2 * t + 3, /*do not care*/ 17)); + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + // random data + for (int i = 0; i < 3; i++) { + auto key = ToString(i * 10); + auto value = ToString(i * 10); + Slice key_slice = key; + Slice value_slice = value; + ASSERT_OK(db_->Put(write_opts, key_slice, value_slice)); + ASSERT_OK(Flush()); } + // Make num_levels to 2 to do key range filtering of sst files + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + 
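As an aside, the timestamped point-lookup pattern these MultiGet tests repeat is roughly the following (an illustrative sketch, not part of the patch; db and the encoded timestamp string stand in for the test fixture's members):

    std::string read_ts_str = Timestamp(2, 0);  // test helper that encodes a timestamp
    Slice read_ts = read_ts_str;
    ReadOptions read_opts;
    read_opts.timestamp = &read_ts;             // read as of this timestamp
    std::vector<Slice> keys{"foo"};
    std::vector<PinnableSlice> values(keys.size());
    std::vector<Status> statuses(keys.size());
    db->MultiGet(read_opts, db->DefaultColumnFamily(), keys.size(), keys.data(),
                 values.data(), statuses.data());
    // statuses[0] reports whether "foo" was visible at read_ts.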
ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + + // Read with MultiGet + ts_str = Timestamp(2, 0); + ts = ts_str; ReadOptions read_opts; - // Scan with only read_opts.iter_start_seqnum set. - for (size_t i = 0; i != read_ts_list.size(); ++i) { - Slice read_ts = read_ts_list[i]; - read_opts.timestamp = &read_ts; - read_opts.iter_start_seqnum = start_seqs[i] + 1; - std::unique_ptr iter(db_->NewIterator(read_opts)); - SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1; - uint64_t key = kMinKey; - for (iter->Seek(Key1(kMinKey)); iter->Valid(); iter->Next()) { - CheckIterEntry( - iter.get(), Key1(key), expected_seq, - (key % 2) ? kTypeValue : kTypeDeletionWithTimestamp, - (key % 2) ? "value" + std::to_string(i + 1) : std::string(), - write_ts_list[i + 1]); - ++key; - --expected_seq; - } - } - // Scan with both read_opts.iter_start_seqnum and read_opts.iter_start_ts set. - std::vector read_ts_lb_list; - for (int t = 0; t < kNumTimestamps - 1; ++t) { - read_ts_lb_list.push_back(Timestamp(2 * t, /*do not care*/ 17)); - } - for (size_t i = 0; i < read_ts_list.size(); ++i) { - Slice read_ts = read_ts_list[i]; - Slice read_ts_lb = read_ts_lb_list[i]; - read_opts.timestamp = &read_ts; - read_opts.iter_start_ts = &read_ts_lb; - read_opts.iter_start_seqnum = start_seqs[i] + 1; - std::unique_ptr it(db_->NewIterator(read_opts)); - uint64_t key = kMinKey; - SequenceNumber expected_seq = start_seqs[i] + (kMaxKey - kMinKey) + 1; - for (it->Seek(Key1(kMinKey)); it->Valid(); it->Next()) { - CheckIterEntry(it.get(), Key1(key), expected_seq, - (key % 2) ? kTypeValue : kTypeDeletionWithTimestamp, - "value" + std::to_string(i + 1), write_ts_list[i + 1]); - ++key; - --expected_seq; - } - } + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + ASSERT_OK(statuses[0]); Close(); } -TEST_F(DBBasicTestWithTimestamp, ReseekToTargetTimestamp) { +TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetPrefixFilter) { Options options = CurrentOptions(); options.env = env_; options.create_if_missing = true; - constexpr size_t kNumKeys = 16; - options.max_sequential_skip_in_iterations = kNumKeys / 2; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.prefix_extractor.reset(NewCappedPrefixTransform(5)); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.cache_index_and_filter_blocks = true; + bbto.whole_key_filtering = false; + bbto.index_type = GetParam(); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); const size_t kTimestampSize = Timestamp(0, 0).size(); TestComparator test_cmp(kTimestampSize); options.comparator = &test_cmp; DestroyAndReopen(options); - // Insert kNumKeys + WriteOptions write_opts; - Status s; - for (size_t i = 0; i != kNumKeys; ++i) { - std::string ts_str = Timestamp(static_cast(i + 1), 0); - Slice ts = ts_str; - write_opts.timestamp = &ts; - s = db_->Put(write_opts, "foo", "value" + std::to_string(i)); - ASSERT_OK(s); - } - { - ReadOptions read_opts; - std::string ts_str = Timestamp(1, 0); - Slice ts = ts_str; - read_opts.timestamp = &ts; - std::unique_ptr iter(db_->NewIterator(read_opts)); - iter->SeekToFirst(); - CheckIterUserEntry(iter.get(), "foo", 
kTypeValue, "value0", ts_str); - ASSERT_EQ( - 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); - } + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + ASSERT_OK(db_->Put(write_opts, "foo", "bar")); + + ASSERT_OK(Flush()); + // Read with MultiGet + ts_str = Timestamp(2, 0); + ts = ts_str; + ReadOptions read_opts; + read_opts.timestamp = &ts; + size_t batch_size = 1; + std::vector keys(batch_size); + std::vector values(batch_size); + std::vector timestamps(batch_size); + keys[0] = "foo"; + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + std::vector cfhs(keys.size(), cfh); + std::vector statuses = + db_->MultiGet(read_opts, cfhs, keys, &values, ×tamps); + + ASSERT_OK(statuses[0]); Close(); } -TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) { +TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringNext) { Options options = CurrentOptions(); options.env = env_; options.create_if_missing = true; - constexpr size_t kNumKeys = 16; - options.max_sequential_skip_in_iterations = kNumKeys / 2; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); const size_t kTimestampSize = Timestamp(0, 0).size(); TestComparator test_cmp(kTimestampSize); options.comparator = &test_cmp; DestroyAndReopen(options); - // Write kNumKeys + 1 keys + constexpr size_t max_skippable_internal_keys = 2; + const size_t kNumKeys = max_skippable_internal_keys + 2; WriteOptions write_opts; Status s; - for (size_t i = 0; i != kNumKeys; ++i) { - std::string ts_str = Timestamp(static_cast(i + 1), 0); + { + std::string ts_str = Timestamp(1, 0); Slice ts = ts_str; write_opts.timestamp = &ts; - s = db_->Put(write_opts, "a", "value" + std::to_string(i)); - ASSERT_OK(s); + ASSERT_OK(db_->Put(write_opts, "a", "value")); } - { - std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); - WriteBatch batch(0, 0, kTimestampSize); - ASSERT_OK(batch.Put("a", "new_value")); - ASSERT_OK(batch.Put("b", "new_value")); - s = batch.AssignTimestamp(ts_str); - ASSERT_OK(s); - s = db_->Write(write_opts, &batch); + for (size_t i = 0; i < kNumKeys; ++i) { + std::string ts_str = Timestamp(static_cast(i + 1), 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + s = db_->Put(write_opts, "b", "value" + std::to_string(i)); ASSERT_OK(s); } { ReadOptions read_opts; - std::string ts_str = Timestamp(static_cast(kNumKeys + 1), 0); + read_opts.max_skippable_internal_keys = max_skippable_internal_keys; + std::string ts_str = Timestamp(1, 0); Slice ts = ts_str; read_opts.timestamp = &ts; std::unique_ptr iter(db_->NewIterator(read_opts)); - iter->Seek("a"); + iter->SeekToFirst(); iter->Next(); - CheckIterUserEntry(iter.get(), "b", kTypeValue, "new_value", ts_str); - ASSERT_EQ( - 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION)); + ASSERT_TRUE(iter->status().IsIncomplete()); } Close(); } -TEST_F(DBBasicTestWithTimestamp, MaxKeysSkipped) { - Options options = CurrentOptions(); +TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringPrev) { + Options options = GetDefaultOptions(); options.env = env_; options.create_if_missing = true; const size_t kTimestampSize = Timestamp(0, 0).size(); @@ -506,13 +1412,13 @@ TEST_F(DBBasicTestWithTimestamp, MaxKeysSkipped) { std::string ts_str = Timestamp(1, 0); Slice ts = ts_str; write_opts.timestamp = &ts; - ASSERT_OK(db_->Put(write_opts, "a", "value")); + ASSERT_OK(db_->Put(write_opts, "b", "value")); } for (size_t i = 0; i < kNumKeys; ++i) { std::string ts_str = Timestamp(static_cast(i + 1), 0); Slice ts = ts_str; 
write_opts.timestamp = &ts; - s = db_->Put(write_opts, "b", "value" + std::to_string(i)); + s = db_->Put(write_opts, "a", "value" + std::to_string(i)); ASSERT_OK(s); } { @@ -522,8 +1428,8 @@ TEST_F(DBBasicTestWithTimestamp, MaxKeysSkipped) { Slice ts = ts_str; read_opts.timestamp = &ts; std::unique_ptr iter(db_->NewIterator(read_opts)); - iter->SeekToFirst(); - iter->Next(); + iter->SeekToLast(); + iter->Prev(); ASSERT_TRUE(iter->status().IsIncomplete()); } Close(); @@ -588,6 +1494,138 @@ TEST_F(DBBasicTestWithTimestamp, CompactDeletionWithTimestampMarkerToBottom) { Close(); } +class DBBasicTestWithTimestampFilterPrefixSettings + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple, bool, bool, + std::shared_ptr, bool, double, + BlockBasedTableOptions::IndexType>> { + public: + DBBasicTestWithTimestampFilterPrefixSettings() + : DBBasicTestWithTimestampBase( + "db_basic_test_with_timestamp_filter_prefix") {} +}; + +TEST_P(DBBasicTestWithTimestampFilterPrefixSettings, GetAndMultiGet) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + BlockBasedTableOptions bbto; + bbto.filter_policy = std::get<0>(GetParam()); + bbto.whole_key_filtering = std::get<1>(GetParam()); + bbto.cache_index_and_filter_blocks = std::get<2>(GetParam()); + bbto.index_type = std::get<6>(GetParam()); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.prefix_extractor = std::get<3>(GetParam()); + options.memtable_whole_key_filtering = std::get<4>(GetParam()); + options.memtable_prefix_bloom_size_ratio = std::get<5>(GetParam()); + + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + const int kMaxKey = 1000; + + // Write any value + WriteOptions write_opts; + std::string ts_str = Timestamp(1, 0); + Slice ts = ts_str; + write_opts.timestamp = &ts; + + int idx = 0; + for (; idx < kMaxKey / 4; idx++) { + ASSERT_OK(db_->Put(write_opts, Key1(idx), "bar")); + ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), "bar")); + } + + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + for (; idx < kMaxKey / 2; idx++) { + ASSERT_OK(db_->Put(write_opts, Key1(idx), "bar")); + ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), "bar")); + } + + ASSERT_OK(Flush()); + + for (; idx < kMaxKey; idx++) { + ASSERT_OK(db_->Put(write_opts, Key1(idx), "bar")); + ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), "bar")); + } + + // Read with MultiGet + ReadOptions read_opts; + read_opts.timestamp = &ts; + + ReadOptions read_opts_total_order; + read_opts_total_order.timestamp = &ts; + read_opts_total_order.total_order_seek = true; + + for (idx = 0; idx < kMaxKey; idx++) { + size_t batch_size = 4; + std::vector keys_str(batch_size); + std::vector values(batch_size); + std::vector statuses(batch_size); + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + + keys_str[0] = Key1(idx); + keys_str[1] = KeyWithPrefix("foo", idx); + keys_str[2] = Key1(kMaxKey + idx); + keys_str[3] = KeyWithPrefix("foo", kMaxKey + idx); + + auto keys = ConvertStrToSlice(keys_str); + + db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(), + statuses.data()); + + for (int i = 0; i < 2; i++) { + ASSERT_OK(statuses[i]); + } + for (int i = 2; i < 4; i++) { + ASSERT_TRUE(statuses[i].IsNotFound()); + } + + for (int i = 0; i < 2; i++) { + std::string value; + ASSERT_OK(db_->Get(read_opts, 
keys[i], &value)); + std::unique_ptr it1(db_->NewIterator(read_opts)); + ASSERT_NE(nullptr, it1); + ASSERT_OK(it1->status()); + // TODO(zjay) Fix seek with prefix + // it1->Seek(keys[i]); + // ASSERT_TRUE(it1->Valid()); + } + + for (int i = 2; i < 4; i++) { + std::string value; + Status s = db_->Get(read_opts, keys[i], &value); + ASSERT_TRUE(s.IsNotFound()); + } + } + Close(); +} + +INSTANTIATE_TEST_CASE_P( + Timestamp, DBBasicTestWithTimestampFilterPrefixSettings, + ::testing::Combine( + ::testing::Values( + std::shared_ptr(nullptr), + std::shared_ptr(NewBloomFilterPolicy(10, true)), + std::shared_ptr(NewBloomFilterPolicy(10, + false))), + ::testing::Bool(), ::testing::Bool(), + ::testing::Values( + std::shared_ptr(NewFixedPrefixTransform(1)), + std::shared_ptr(NewFixedPrefixTransform(4)), + std::shared_ptr(NewFixedPrefixTransform(7)), + std::shared_ptr(NewFixedPrefixTransform(8))), + ::testing::Bool(), ::testing::Values(0, 0.1), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey))); + class DataVisibilityTest : public DBBasicTestWithTimestampBase { public: DataVisibilityTest() : DBBasicTestWithTimestampBase("data_visibility_test") { @@ -1051,7 +2089,7 @@ TEST_F(DataVisibilityTest, MultiGetWithTimestamp) { VerifyDefaultCF(snap0); VerifyDefaultCF(snap1); - Flush(); + ASSERT_OK(Flush()); const Snapshot* snap2 = db_->GetSnapshot(); PutTestData(2); @@ -1137,7 +2175,7 @@ TEST_F(DataVisibilityTest, MultiGetCrossCF) { VerifyDefaultCF(snap0); VerifyDefaultCF(snap1); - Flush(); + ASSERT_OK(Flush()); const Snapshot* snap2 = db_->GetSnapshot(); PutTestData(2); @@ -1518,6 +2556,8 @@ TEST_F(DBBasicTestWithTimestamp, BatchWriteAndMultiGet) { options.create_if_missing = true; options.env = env_; options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.memtable_prefix_bloom_size_ratio = 0.1; + options.memtable_whole_key_filtering = true; size_t ts_sz = Timestamp(0, 0).size(); TestComparator test_cmp(ts_sz); @@ -1665,14 +2705,15 @@ class DBBasicTestWithTimestampPrefixSeek : public DBBasicTestWithTimestampBase, public testing::WithParamInterface< std::tuple, - std::shared_ptr, bool>> { + std::shared_ptr, bool, + BlockBasedTableOptions::IndexType>> { public: DBBasicTestWithTimestampPrefixSeek() : DBBasicTestWithTimestampBase( "/db_basic_test_with_timestamp_prefix_seek") {} }; -TEST_P(DBBasicTestWithTimestampPrefixSeek, ForwardIterateWithPrefix) { +TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) { const size_t kNumKeysPerFile = 128; Options options = CurrentOptions(); options.env = env_; @@ -1684,6 +2725,7 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, ForwardIterateWithPrefix) { options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); BlockBasedTableOptions bbto; bbto.filter_policy = std::get<1>(GetParam()); + bbto.index_type = std::get<3>(GetParam()); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); @@ -1722,6 +2764,13 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, ForwardIterateWithPrefix) { "value" + std::to_string(i), write_ts_list[i]); iter->Next(); ASSERT_FALSE(iter->Valid()); + + // Seek to kMinKey + iter->Seek(Key1(kMinKey)); + CheckIterUserEntry(iter.get(), Key1(kMinKey), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); } const std::vector targets = 
{kMinKey, kMinKey + 0x10, kMinKey + 0x100, kMaxKey}; @@ -1737,6 +2786,7 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, ForwardIterateWithPrefix) { Slice read_ts = read_ts_list[i]; read_opts.timestamp = &read_ts; std::unique_ptr it(db_->NewIterator(read_opts)); + // Forward and backward iterate. for (size_t j = 0; j != targets.size(); ++j) { std::string start_key = Key1(targets[j]); uint64_t expected_ub = @@ -1760,6 +2810,24 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, ForwardIterateWithPrefix) { it->Next(); } ASSERT_EQ(expected_ub - targets[j] + 1, count); + + count = 0; + expected_key = targets[j]; + it->SeekForPrev(start_key); + uint64_t expected_lb = (targets[j] & kPrefixMask); + while (it->Valid()) { + // Out of prefix + if (!read_opts.prefix_same_as_start && + pe->Transform(it->key()) != pe->Transform(start_key)) { + break; + } + CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue, + "value" + std::to_string(i), write_ts_list[i]); + ++count; + --expected_key; + it->Prev(); + } + ASSERT_EQ(targets[j] - std::max(expected_lb, kMinKey) + 1, count); } } } @@ -1772,6 +2840,7 @@ INSTANTIATE_TEST_CASE_P( Timestamp, DBBasicTestWithTimestampPrefixSeek, ::testing::Combine( ::testing::Values( + std::shared_ptr(NewFixedPrefixTransform(1)), std::shared_ptr(NewFixedPrefixTransform(4)), std::shared_ptr(NewFixedPrefixTransform(7)), std::shared_ptr(NewFixedPrefixTransform(8))), @@ -1781,19 +2850,25 @@ INSTANTIATE_TEST_CASE_P( std::shared_ptr( NewBloomFilterPolicy(20 /*bits_per_key*/, false))), - ::testing::Bool())); + ::testing::Bool(), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey))); class DBBasicTestWithTsIterTombstones : public DBBasicTestWithTimestampBase, public testing::WithParamInterface< std::tuple, - std::shared_ptr, int>> { + std::shared_ptr, int, + BlockBasedTableOptions::IndexType>> { public: DBBasicTestWithTsIterTombstones() : DBBasicTestWithTimestampBase("/db_basic_ts_iter_tombstones") {} }; -TEST_P(DBBasicTestWithTsIterTombstones, ForwardIterDelete) { +TEST_P(DBBasicTestWithTsIterTombstones, IterWithDelete) { constexpr size_t kNumKeysPerFile = 128; Options options = CurrentOptions(); options.env = env_; @@ -1804,6 +2879,7 @@ TEST_P(DBBasicTestWithTsIterTombstones, ForwardIterDelete) { options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); BlockBasedTableOptions bbto; bbto.filter_policy = std::get<1>(GetParam()); + bbto.index_type = std::get<3>(GetParam()); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.num_levels = std::get<2>(GetParam()); DestroyAndReopen(options); @@ -1823,7 +2899,7 @@ TEST_P(DBBasicTestWithTsIterTombstones, ForwardIterDelete) { } ++key; } while (true); - // Delete them all + ts = write_ts_strs[1]; write_opts.timestamp = &ts; for (key = kMaxKey; key >= kMinKey; --key) { @@ -1850,6 +2926,13 @@ TEST_P(DBBasicTestWithTsIterTombstones, ForwardIterDelete) { ASSERT_EQ("value1" + std::to_string(key), iter->value()); } ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count); + + for (iter->SeekToLast(), count = 0, key = kMaxKey; iter->Valid(); + key -= 2, ++count, iter->Prev()) { + ASSERT_EQ(Key1(key), iter->key()); + ASSERT_EQ("value1" + std::to_string(key), iter->value()); + } + ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count); } Close(); } @@ -1865,7 +2948,12 @@ INSTANTIATE_TEST_CASE_P( NewBloomFilterPolicy(10, false)), std::shared_ptr( 
NewBloomFilterPolicy(20, false))), - ::testing::Values(2, 6))); + ::testing::Values(2, 6), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey))); } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_write_buffer_manager_test.cc b/db/db_write_buffer_manager_test.cc new file mode 100644 index 00000000000..0ae74475284 --- /dev/null +++ b/db/db_write_buffer_manager_test.cc @@ -0,0 +1,801 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/db_test_util.h" +#include "db/write_thread.h" +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +class DBWriteBufferManagerTest : public DBTestBase, + public testing::WithParamInterface { + public: + DBWriteBufferManagerTest() + : DBTestBase("/db_write_buffer_manager_test", /*env_do_fsync=*/false) {} + bool cost_cache_; +}; + +TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + // WriteBufferManager::buffer_size_ has exceeded after the previous write is + // completed. + + // This make sures write will go through and if stall was in effect, it will + // end. + ASSERT_OK(Put(0, Key(2), DummyString(1), wo)); +} + +// Test Single DB with multiple writer threads get blocked when +// WriteBufferManager execeeds buffer_size_ and flush is waiting to be +// finished. 
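The stall configuration shared by all of the write-buffer-manager tests in this file can be summarized as follows (a minimal sketch using only what the tests themselves set up; the third WriteBufferManager constructor argument is the allow-stall flag these tests pass as true):

    std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
    Options options;
    options.write_buffer_size = 500000;  // per-memtable limit, never reached here
    // With the stall flag set, writers block once total memtable usage exceeds
    // the 100000-byte budget and stay blocked until a flush frees memory.
    options.write_buffer_manager =
        std::make_shared<WriteBufferManager>(100000, cache, true);
    // The same manager object can be shared by several DBs and column families,
    // which is what the multi-DB tests below rely on.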
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". No flush will be triggered. + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + // WriteBufferManager::buffer_size_ has exceeded after the previous write is + // completed. + + std::unordered_set w_set; + std::vector threads; + int wait_count_db = 0; + int num_writers = 4; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.SignalAll(); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + InstrumentedMutexLock lock(&mutex); + WriteThread::Writer* w = reinterpret_cast(arg); + w_set.insert(w); + // Allow the flush to continue if all writer threads are blocked. + if (w_set.size() == (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + std::function writer = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + Status tmp = Put(cf, Slice(key), DummyString(1), wo); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + // Flow: + // main_writer thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // and they will be blocked. + // | + // | + // Last writer thread will write and when its blocked it will signal Flush to + // continue to clear the stall. + + threads.emplace_back(writer, 1); + // Wait untill first thread (main_writer) writing to DB is blocked and then + // create the multiple writers which will be blocked from getting added to the + // queue because stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + for (int i = 0; i < num_writers; i++) { + threads.emplace_back(writer, i % 4); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s); + + // Number of DBs blocked. + ASSERT_EQ(wait_count_db, 1); + // Number of Writer threads blocked. 
+ ASSERT_EQ(w_set.size(), num_writers); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple DBs get blocked when WriteBufferManager limit exceeds and flush +// is waiting to be finished but DBs tries to write meanwhile. +TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 3; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager Limit exceeded. + std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.Signal(); + // Since this is the last DB, signal Flush to continue. + if (wait_count_db == num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s = true; + + // Write to DB. + std::function write_db = [&](DB* db) { + Status tmp = db->Put(wo, Key(3), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s = s && tmp.ok(); + }; + + // Flow: + // db_ will write and will be blocked (as Flush will on hold and will create + // stall in effect). + // | + // multiple dbs writers will be created to write to that db and they will be + // blocked. + // | + // | + // Last writer will write and when its blocked it will signal Flush to + // continue to clear the stall. + + threads.emplace_back(write_db, db_); + // Wait untill first DB is blocked and then create the multiple writers for + // different DBs which will be blocked from getting added to the queue because + // stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + for (int i = 0; i < num_dbs; i++) { + threads.emplace_back(write_db, dbs[i]); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s); + ASSERT_EQ(num_dbs + 1, wait_count_db); + // Clean up DBs. 
+ for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple DBs and multiple columns get +// blocked when stall by WriteBufferManager is in effect. +TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 3; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // dbs[0] is completed. + std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::unordered_set w_set; + std::vector writer_threads; + std::atomic thread_num(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + thread_num.fetch_add(1); + cv.Signal(); + // Allow the flush to continue if all writer threads are blocked. + if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + WriteThread::Writer* w = reinterpret_cast(arg); + { + InstrumentedMutexLock lock(&mutex); + w_set.insert(w); + thread_num.fetch_add(1); + // Allow the flush continue if all writer threads are blocked. + if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + // Write to multiple columns of db_. + std::function write_cf = [&](int cf) { + Status tmp = Put(cf, Key(3), DummyString(1), wo); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + // Write to multiple DBs. 
+ std::function write_db = [&](DB* db) { + Status tmp = db->Put(wo, Key(3), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s2 = s2 && tmp.ok(); + }; + + // Flow: + // thread will write to db_ will be blocked (as Flush will on hold, + // buffer_size_ has exceeded and will create stall in effect). + // | + // | + // multiple writers threads writing to different DBs and to db_ across + // multiple columns will be created and they will be blocked due to stall. + // | + // | + // Last writer thread will write and when its blocked it will signal Flush to + // continue to clear the stall. + threads.emplace_back(write_db, db_); + // Wait untill first thread is blocked and then create the multiple writer + // threads. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_dbs; i++) { + // Write to multiple columns of db_. + writer_threads.emplace_back(write_cf, i % 3); + // Write to different dbs. + threads.emplace_back(write_db, dbs[i]); + } + for (auto& t : threads) { + t.join(); + } + for (auto& t : writer_threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + + // Number of DBs blocked. + ASSERT_EQ(num_dbs + 1, wait_count_db); + // Number of Writer threads blocked. + ASSERT_EQ(w_set.size(), num_dbs); + // Clean up DBs. + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple columns of db_ by passing +// different values to WriteOption.no_slown_down. +TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + WriteOptions wo; + wo.disableWAL = true; + + CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); + + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + Flush(3); + ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); + Flush(0); + + // Write to "Default", "cf2" and "cf3". No flush will be triggered. + ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); + ASSERT_OK(Put(0, Key(1), DummyString(40000), wo)); + ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); + ASSERT_OK(Put(3, Key(2), DummyString(40000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // db_ is completed. 
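In essence, the mixed no_slowdown behavior exercised from here on boils down to the following (sketch only; the assertions below merely require the no_slowdown write to return a non-OK status while the stall is active):

    WriteOptions blocking;
    blocking.no_slowdown = false;        // writer waits until the stall clears
    WriteOptions non_blocking;
    non_blocking.no_slowdown = true;     // writer refuses to wait
    Status s1 = db->Put(blocking, "k1", "v");      // succeeds once flush unblocks writers
    Status s2 = db->Put(non_blocking, "k2", "v");  // fails immediately during the stall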
+ + std::unordered_set w_slowdown_set; + std::vector threads; + int wait_count_db = 0; + int num_writers = 4; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::atomic thread_num(0); + std::atomic w_no_slowdown(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.SignalAll(); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + { + InstrumentedMutexLock lock(&mutex); + WriteThread::Writer* w = reinterpret_cast(arg); + w_slowdown_set.insert(w); + // Allow the flush continue if all writer threads are blocked. + if (w_slowdown_set.size() + (unsigned long)w_no_slowdown.load( + std::memory_order_relaxed) == + (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + + std::function write_slow_down = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = false; + Status tmp = Put(cf, Slice(key), DummyString(1), write_op); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + + std::function write_no_slow_down = [&](int cf) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = true; + Status tmp = Put(cf, Slice(key), DummyString(1), write_op); + { + InstrumentedMutexLock lock(&mutex); + s2 = s2 && !tmp.ok(); + w_no_slowdown.fetch_add(1); + // Allow the flush continue if all writer threads are blocked. + if (w_slowdown_set.size() + + (unsigned long)w_no_slowdown.load(std::memory_order_relaxed) == + (unsigned long)num_writers) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }; + + // Flow: + // main_writer thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // with different values of WriteOptions.no_slowdown. Some of them will + // be blocked and some of them will return with Incomplete status. + // | + // | + // Last writer thread will write and when its blocked/return it will signal + // Flush to continue to clear the stall. + threads.emplace_back(write_slow_down, 1); + // Wait untill first thread (main_writer) writing to DB is blocked and then + // create the multiple writers which will be blocked from getting added to the + // queue because stall is in effect. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_writers; i += 2) { + threads.emplace_back(write_no_slow_down, (i) % 4); + threads.emplace_back(write_slow_down, (i + 1) % 4); + } + for (auto& t : threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + // Number of DBs blocked. + ASSERT_EQ(wait_count_db, 1); + // Number of Writer threads blocked. + ASSERT_EQ(w_slowdown_set.size(), num_writers / 2); + // Number of Writer threads with WriteOptions.no_slowdown = true. 
+ ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_writers / 2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +// Test multiple threads writing across multiple columns of db_ and different +// dbs by passing different values to WriteOption.no_slown_down. +TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { + std::vector dbnames; + std::vector dbs; + int num_dbs = 4; + + for (int i = 0; i < num_dbs; i++) { + dbs.push_back(nullptr); + dbnames.push_back( + test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i))); + } + + Options options = CurrentOptions(); + options.arena_block_size = 4096; + options.write_buffer_size = 500000; // this is never hit + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); + ASSERT_LT(cache->GetUsage(), 256 * 1024); + cost_cache_ = GetParam(); + + if (cost_cache_) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, cache, true)); + } else { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } + CreateAndReopenWithCF({"cf1", "cf2"}, options); + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(DestroyDB(dbnames[i], options)); + ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i]))); + } + WriteOptions wo; + wo.disableWAL = true; + + for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000))); + } + // Insert to db_. + ASSERT_OK(Put(0, Key(1), DummyString(30000), wo)); + + // WriteBufferManager::buffer_size_ has exceeded after the previous write to + // dbs[0] is completed. + std::vector threads; + int wait_count_db = 0; + InstrumentedMutex mutex; + InstrumentedCondVar cv(&mutex); + std::unordered_set w_slowdown_set; + std::vector writer_threads; + std::atomic thread_num(0); + std::atomic w_no_slowdown(0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0", + "DBImpl::BackgroundCallFlush:start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WBMStallInterface::BlockDB", [&](void*) { + InstrumentedMutexLock lock(&mutex); + wait_count_db++; + cv.Signal(); + // Allow the flush continue if all writer threads are blocked. + if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::WriteStall::Wait", [&](void* arg) { + WriteThread::Writer* w = reinterpret_cast(arg); + InstrumentedMutexLock lock(&mutex); + w_slowdown_set.insert(w); + // Allow the flush continue if all writer threads are blocked. 
+ if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + bool s1 = true, s2 = true; + std::function write_slow_down = [&](DB* db) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = false; + Status tmp = db->Put(write_op, Slice(key), DummyString(1)); + InstrumentedMutexLock lock(&mutex); + s1 = s1 && tmp.ok(); + }; + + std::function write_no_slow_down = [&](DB* db) { + int a = thread_num.fetch_add(1); + std::string key = "foo" + std::to_string(a); + WriteOptions write_op; + write_op.no_slowdown = true; + Status tmp = db->Put(write_op, Slice(key), DummyString(1)); + { + InstrumentedMutexLock lock(&mutex); + s2 = s2 && !tmp.ok(); + w_no_slowdown.fetch_add(1); + if (w_slowdown_set.size() + + (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) + + wait_count_db) == + (unsigned long)(2 * num_dbs + 1)) { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0"); + } + } + }; + + // Flow: + // first thread will write but will be blocked (as Flush will on hold, + // buffer_size_ has exceeded, thus will create stall in effect). + // | + // | + // multiple writer threads will be created to write across multiple columns + // of db_ and different DBs with different values of + // WriteOptions.no_slowdown. Some of them will be blocked and some of them + // will return with Incomplete status. + // | + // | + // Last writer thread will write and when its blocked/return it will signal + // Flush to continue to clear the stall. + threads.emplace_back(write_slow_down, db_); + // Wait untill first thread writing to DB is blocked and then + // create the multiple writers. + { + InstrumentedMutexLock lock(&mutex); + while (wait_count_db != 1) { + cv.Wait(); + } + } + + for (int i = 0; i < num_dbs; i += 2) { + // Write to multiple columns of db_. + writer_threads.emplace_back(write_slow_down, db_); + writer_threads.emplace_back(write_no_slow_down, db_); + // Write to different DBs. + threads.emplace_back(write_slow_down, dbs[i]); + threads.emplace_back(write_no_slow_down, dbs[i + 1]); + } + + for (auto& t : threads) { + t.join(); + } + + for (auto& t : writer_threads) { + t.join(); + } + + ASSERT_TRUE(s1); + ASSERT_TRUE(s2); + // Number of DBs blocked. + ASSERT_EQ((num_dbs / 2) + 1, wait_count_db); + // Number of writer threads writing to db_ blocked from getting added to the + // queue. + ASSERT_EQ(w_slowdown_set.size(), num_dbs / 2); + // Number of threads with WriteOptions.no_slowdown = true. + ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_dbs); + + // Clean up DBs. 
+ for (int i = 0; i < num_dbs; i++) { + ASSERT_OK(dbs[i]->Close()); + ASSERT_OK(DestroyDB(dbnames[i], options)); + delete dbs[i]; + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest, + testing::Bool()); + +} // namespace ROCKSDB_NAMESPACE + +#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS +extern "C" { +void RegisterCustomObjects(int argc, char** argv); +} +#else +void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {} +#endif // !ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + RegisterCustomObjects(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_write_test.cc b/db/db_write_test.cc index b2389605060..4a8f90c2cb2 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -60,14 +60,15 @@ TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) { std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = false; - dbfull()->Put(wo, key, "bar"); + ASSERT_OK(dbfull()->Put(wo, key, "bar")); }; std::function write_no_slowdown_func = [&]() { int a = thread_num.fetch_add(1); std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = true; - dbfull()->Put(wo, key, "bar"); + Status s = dbfull()->Put(wo, key, "bar"); + ASSERT_TRUE(s.ok() || s.IsIncomplete()); }; std::function unblock_main_thread_func = [&](void*) { mutex.Lock(); @@ -77,13 +78,13 @@ TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) { }; // Create 3 L0 files and schedule 4th without waiting - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func); @@ -104,7 +105,7 @@ TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) { // write_thread FlushOptions fopt; fopt.wait = false; - dbfull()->Flush(fopt); + ASSERT_OK(dbfull()->Flush(fopt)); // Create a mix of slowdown/no_slowdown write threads mutex.Lock(); @@ -145,7 +146,7 @@ TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) { mutex.Unlock(); TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:1"); - dbfull()->TEST_WaitForFlushMemTable(nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); // This would have triggered a write stall. Unblock the write group leader TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:2"); // The leader is going to create missing newer links. 
When the leader @@ -178,14 +179,15 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = false; - dbfull()->Put(wo, key, "bar"); + ASSERT_OK(dbfull()->Put(wo, key, "bar")); }; std::function write_no_slowdown_func = [&]() { int a = thread_num.fetch_add(1); std::string key = "foo" + std::to_string(a); WriteOptions wo; wo.no_slowdown = true; - dbfull()->Put(wo, key, "bar"); + Status s = dbfull()->Put(wo, key, "bar"); + ASSERT_TRUE(s.ok() || s.IsIncomplete()); }; std::function unblock_main_thread_func = [&](void *) { mutex.Lock(); @@ -195,13 +197,13 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { }; // Create 3 L0 files and schedule 4th without waiting - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); - Flush(); - Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func); @@ -222,7 +224,7 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { // write_thread FlushOptions fopt; fopt.wait = false; - dbfull()->Flush(fopt); + ASSERT_OK(dbfull()->Flush(fopt)); // Create a mix of slowdown/no_slowdown write threads mutex.Lock(); @@ -243,7 +245,7 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { mutex.Unlock(); TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1"); - dbfull()->TEST_WaitForFlushMemTable(nullptr); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); // This would have triggered a write stall. Unblock the write group leader TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2"); // The leader is going to create missing newer links. When the leader finishes, @@ -260,7 +262,7 @@ TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) { TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { constexpr int kNumThreads = 5; std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); Reopen(options); @@ -307,6 +309,11 @@ TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { threads[i].join(); } ASSERT_EQ(1, leader_count); + + // The Failed PUT operations can cause a BG error to be set. + // Mark it as Checked for the ASSERT_STATUS_CHECKED + dbfull()->Resume().PermitUncheckedError(); + // Close before mock_env destruct. 
Close(); } @@ -320,7 +327,7 @@ TEST_P(DBWriteTest, ManualWalFlushInEffect) { ASSERT_TRUE(dbfull()->FlushWAL(false).ok()); ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty()); // try the 2nd wal created during SwitchWAL - dbfull()->TEST_SwitchWAL(); + ASSERT_OK(dbfull()->TEST_SwitchWAL()); ASSERT_TRUE(Put("key" + ToString(0), "value").ok()); ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty()); ASSERT_TRUE(dbfull()->FlushWAL(false).ok()); @@ -329,7 +336,7 @@ TEST_P(DBWriteTest, ManualWalFlushInEffect) { TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) { std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); Reopen(options); @@ -351,7 +358,9 @@ TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) { } */ if (!options.manual_wal_flush) { - ASSERT_FALSE(res.ok()); + ASSERT_NOK(res); + } else { + ASSERT_OK(res); } } // Close before mock_env destruct. @@ -361,7 +370,7 @@ TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) { TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) { Random rnd(301); std::unique_ptr mock_env( - new FaultInjectionTestEnv(Env::Default())); + new FaultInjectionTestEnv(env_)); Options options = GetOptions(); options.env = mock_env.get(); options.writable_file_max_buffer_size = 4 * 1024 * 1024; @@ -395,7 +404,7 @@ TEST_P(DBWriteTest, LockWalInEffect) { ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false)); ASSERT_OK(dbfull()->UnlockWAL()); // try the 2nd wal created during SwitchWAL - dbfull()->TEST_SwitchWAL(); + ASSERT_OK(dbfull()->TEST_SwitchWAL()); ASSERT_OK(Put("key" + ToString(0), "value")); ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty()); ASSERT_OK(dbfull()->LockWAL()); @@ -423,13 +432,14 @@ TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) { ROCKSDB_NAMESPACE::WriteOptions write_option_default; std::string no_wal_key = no_wal_key_prefix + std::to_string(t) + "_" + std::to_string(i); - this->Put(no_wal_key, no_wal_value, write_option_disable); + ASSERT_OK( + this->Put(no_wal_key, no_wal_value, write_option_disable)); std::string wal_key = wal_key_prefix + std::to_string(i) + "_" + std::to_string(i); - this->Put(wal_key, wal_value, write_option_default); - dbfull()->SyncWAL(); + ASSERT_OK(this->Put(wal_key, wal_value, write_option_default)); + ASSERT_OK(dbfull()->SyncWAL()); } - return 0; + return; }); } for (auto& t: threads) { diff --git a/db/dbformat.cc b/db/dbformat.cc index ada35f1fb2e..8ac0617649f 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -49,7 +49,8 @@ EntryType GetEntryType(ValueType value_type) { bool ParseFullKey(const Slice& internal_key, FullKey* fkey) { ParsedInternalKey ikey; - if (ParseInternalKey(internal_key, &ikey) != Status::OK()) { + if (!ParseInternalKey(internal_key, &ikey, false /*log_err_key */) + .ok()) { // TODO return false; } fkey->user_key = ikey.user_key; @@ -77,12 +78,34 @@ void AppendInternalKeyFooter(std::string* result, SequenceNumber s, PutFixed64(result, PackSequenceAndType(s, t)); } -std::string ParsedInternalKey::DebugString(bool hex) const { +void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + const std::string kTsMin(ts_sz, static_cast(0)); + result->append(key.data(), key.size()); + result->append(kTsMin.data(), ts_sz); +} + +void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + const std::string kTsMax(ts_sz, static_cast(0xff)); + 
result->append(key.data(), key.size()); + result->append(kTsMax.data(), ts_sz); +} + +std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const { + std::string result = "'"; + if (log_err_key) { + result += user_key.ToString(hex); + } else { + result += ""; + } + char buf[50]; snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence, static_cast(type)); - std::string result = "'"; - result += user_key.ToString(hex); + result += buf; return result; } @@ -90,8 +113,8 @@ std::string ParsedInternalKey::DebugString(bool hex) const { std::string InternalKey::DebugString(bool hex) const { std::string result; ParsedInternalKey parsed; - if (ParseInternalKey(rep_, &parsed) == Status::OK()) { - result = parsed.DebugString(hex); + if (ParseInternalKey(rep_, &parsed, false /* log_err_key */).ok()) { + result = parsed.DebugString(true /* log_err_key */, hex); // TODO } else { result = "(bad)"; result.append(EscapeString(rep_)); diff --git a/db/dbformat.h b/db/dbformat.h index 81c852ac40a..c3f5c543735 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -112,13 +112,19 @@ struct ParsedInternalKey { // u contains timestamp if user timestamp feature is enabled. ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) : user_key(u), sequence(seq), type(t) {} - std::string DebugString(bool hex = false) const; + std::string DebugString(bool log_err_key, bool hex) const; void clear() { user_key.clear(); sequence = 0; type = kTypeDeletion; } + + void SetTimestamp(const Slice& ts) { + assert(ts.size() <= user_key.size()); + const char* addr = user_key.data() + user_key.size() - ts.size(); + memcpy(const_cast(addr), ts.data(), ts.size()); + } }; // Return the length of the encoding of "key". @@ -140,8 +146,10 @@ inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, *seq = packed >> 8; *t = static_cast(packed & 0xff); - assert(*seq <= kMaxSequenceNumber); - assert(IsExtendedValueType(*t)); + // Commented the following two assertions in order to test key-value checksum + // on corrupted keys without crashing ("DbKvChecksumTest"). + // assert(*seq <= kMaxSequenceNumber); + // assert(IsExtendedValueType(*t)); } EntryType GetEntryType(ValueType value_type); @@ -161,12 +169,20 @@ extern void AppendInternalKeyWithDifferentTimestamp( extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s, ValueType t); +// Append the key and a minimal timestamp to *result +extern void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + +// Append the key and a maximal timestamp to *result +extern void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + // Attempt to parse an internal key from "internal_key". On success, // stores the parsed data in "*result", and returns true. // // On error, returns false, leaves "*result" in an undefined state. extern Status ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result); + ParsedInternalKey* result, bool log_err_key); // Returns the user key portion of an internal key. inline Slice ExtractUserKey(const Slice& internal_key) { @@ -285,8 +301,8 @@ class InternalKey { bool Valid() const { ParsedInternalKey parsed; - return (ParseInternalKey(Slice(rep_), &parsed) == Status::OK()) ? 
true - : false; + return (ParseInternalKey(Slice(rep_), &parsed, false /* log_err_key */) + .ok()); // TODO } void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } @@ -319,7 +335,7 @@ class InternalKey { AppendInternalKeyFooter(&rep_, s, t); } - std::string DebugString(bool hex = false) const; + std::string DebugString(bool hex) const; }; inline int InternalKeyComparator::Compare(const InternalKey& a, @@ -328,20 +344,27 @@ inline int InternalKeyComparator::Compare(const InternalKey& a, } inline Status ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result) { + ParsedInternalKey* result, bool log_err_key) { const size_t n = internal_key.size(); + if (n < kNumInternalBytes) { - return Status::Corruption("Internal Key too small"); + return Status::Corruption("Corrupted Key: Internal Key too small. Size=" + + std::to_string(n) + ". "); } + uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes); unsigned char c = num & 0xff; result->sequence = num >> 8; result->type = static_cast(c); assert(result->type <= ValueType::kMaxValue); result->user_key = Slice(internal_key.data(), n - kNumInternalBytes); - return IsExtendedValueType(result->type) - ? Status::OK() - : Status::Corruption("Invalid Key Type"); + + if (IsExtendedValueType(result->type)) { + return Status::OK(); + } else { + return Status::Corruption("Corrupted Key", + result->DebugString(log_err_key, true)); + } } // Update the sequence number in the internal key. @@ -475,15 +498,21 @@ class IterKey { // Update the sequence number in the internal key. Guarantees not to // invalidate slices to the key (and the user key). - void UpdateInternalKey(uint64_t seq, ValueType t) { + void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) { assert(!IsKeyPinned()); assert(key_size_ >= kNumInternalBytes); + if (ts) { + assert(key_size_ >= kNumInternalBytes + ts->size()); + memcpy(&buf_[key_size_ - kNumInternalBytes - ts->size()], ts->data(), + ts->size()); + } uint64_t newval = (seq << 8) | t; EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval); } bool IsKeyPinned() const { return (key_ != buf_); } + // user_key does not have timestamp. void SetInternalKey(const Slice& key_prefix, const Slice& user_key, SequenceNumber s, ValueType value_type = kValueTypeForSeek, @@ -587,7 +616,7 @@ class IterKey { void EnlargeBuffer(size_t key_size); }; -// Convert from a SliceTranform of user keys, to a SliceTransform of +// Convert from a SliceTransform of user keys, to a SliceTransform of // user keys. class InternalKeySliceTransform : public SliceTransform { public: diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index 09ee4a38b4d..06c5bb2025e 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
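// A minimal usage sketch for the ParseInternalKey() signature declared above
// (illustrative; `ikey` is a hypothetical internal-key slice):
//   ParsedInternalKey parsed;
//   Status s = ParseInternalKey(ikey, &parsed, true /* log_err_key */);
//   if (!s.ok()) {
//     // The key is corrupted; the Corruption message includes the user key
//     // only when log_err_key is true, so pass false to keep key contents
//     // out of error logs.
//   }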
#include "db/dbformat.h" -#include "logging/logging.h" #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { @@ -41,12 +40,12 @@ static void TestKey(const std::string& key, Slice in(encoded); ParsedInternalKey decoded("", 0, kTypeValue); - ASSERT_OK(ParseInternalKey(in, &decoded)); + ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */)); ASSERT_EQ(key, decoded.user_key.ToString()); ASSERT_EQ(seq, decoded.sequence); ASSERT_EQ(vt, decoded.type); - ASSERT_NOK(ParseInternalKey(Slice("bar"), &decoded)); + ASSERT_NOK(ParseInternalKey(Slice("bar"), &decoded, true /* log_err_key */)); } class FormatTest : public testing::Test {}; @@ -186,7 +185,7 @@ TEST_F(FormatTest, UpdateInternalKey) { Slice in(ikey); ParsedInternalKey decoded; - ASSERT_OK(ParseInternalKey(in, &decoded)); + ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */)); ASSERT_EQ(user_key, decoded.user_key.ToString()); ASSERT_EQ(new_seq, decoded.sequence); ASSERT_EQ(new_val_type, decoded.type); diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 635951a1e1c..38bc4960275 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -40,7 +40,7 @@ class DeleteFileTest : public DBTestBase { wal_dir_(dbname_ + "/wal_files") {} void SetOptions(Options* options) { - assert(options); + ASSERT_NE(options, nullptr); options->delete_obsolete_files_period_micros = 0; // always do full purge options->enable_thread_tracking = true; options->write_buffer_size = 1024 * 1024 * 1000; @@ -105,14 +105,14 @@ class DeleteFileTest : public DBTestBase { void CheckFileTypeCounts(const std::string& dir, int required_log, int required_sst, int required_manifest) { std::vector filenames; - env_->GetChildren(dir, &filenames); + ASSERT_OK(env_->GetChildren(dir, &filenames)); int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; for (auto file : filenames) { uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { - log_cnt += (type == kLogFile); + log_cnt += (type == kWalFile); sst_cnt += (type == kTableFile); manifest_cnt += (type == kDescriptorFile); } @@ -180,7 +180,8 @@ TEST_F(DeleteFileTest, AddKeysAndQueryLevels) { ASSERT_TRUE(status.IsInvalidArgument()); // Lowest level file deletion should succeed. 
- ASSERT_OK(db_->DeleteFile(level2file)); + status = db_->DeleteFile(level2file); + ASSERT_OK(status); } TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { @@ -201,7 +202,7 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { compact_options.change_level = true; compact_options.target_level = 2; Slice first_slice(first), last_slice(last); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 1 sst after compaction CheckFileTypeCounts(dbname_, 0, 1, 1); @@ -210,7 +211,9 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { Iterator *itr = nullptr; CreateTwoLevels(); itr = db_->NewIterator(ReadOptions()); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(itr->status()); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); + ASSERT_OK(itr->status()); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); delete itr; @@ -237,7 +240,8 @@ TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { ReadOptions read_options; read_options.background_purge_on_iterator_cleanup = true; itr = db_->NewIterator(read_options); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(itr->status()); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); test::SleepingBackgroundTask sleeping_task_before; @@ -344,11 +348,12 @@ TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { ReadOptions read_options; read_options.background_purge_on_iterator_cleanup = true; itr = db_->NewIterator(read_options); + ASSERT_OK(itr->status()); // ReadOptions is deleted, but iterator cleanup function should not be // affected } - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 3 sst after compaction with live iterator CheckFileTypeCounts(dbname_, 0, 3, 1); delete itr; @@ -382,9 +387,11 @@ TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) { ReadOptions read_options; read_options.background_purge_on_iterator_cleanup = true; Iterator* itr1 = db_->NewIterator(read_options); + ASSERT_OK(itr1->status()); CreateTwoLevels(); Iterator* itr2 = db_->NewIterator(read_options); - db_->CompactRange(compact_options, &first_slice, &last_slice); + ASSERT_OK(itr2->status()); + ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice)); // 5 sst files after 2 compactions with 2 live iterators CheckFileTypeCounts(dbname_, 0, 5, 1); @@ -417,6 +424,7 @@ TEST_F(DeleteFileTest, DeleteFileWithIterator) { CreateTwoLevels(); ReadOptions read_options; Iterator* it = db_->NewIterator(read_options); + ASSERT_OK(it->status()); std::vector metadata; db_->GetLiveFilesMetaData(&metadata); @@ -432,7 +440,7 @@ TEST_F(DeleteFileTest, DeleteFileWithIterator) { Status status = db_->DeleteFile(level2file); fprintf(stdout, "Deletion status %s: %s\n", level2file.c_str(), status.ToString().c_str()); - ASSERT_TRUE(status.ok()); + ASSERT_OK(status); it->SeekToFirst(); int numKeysIterated = 0; while(it->Valid()) { @@ -452,7 +460,7 @@ TEST_F(DeleteFileTest, DeleteLogFiles) { AddKeys(10, 0); VectorLogPtr logfiles; - db_->GetSortedWalFiles(logfiles); + ASSERT_OK(db_->GetSortedWalFiles(logfiles)); ASSERT_GT(logfiles.size(), 0UL); // Take the last log file which is expected to be alive and try to delete it // Should not succeed because live logs are not allowed to be deleted @@ -461,7 
+469,7 @@ TEST_F(DeleteFileTest, DeleteLogFiles) { ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName())); fprintf(stdout, "Deleting alive log file %s\n", alive_log->PathName().c_str()); - ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok()); + ASSERT_NOK(db_->DeleteFile(alive_log->PathName())); ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName())); logfiles.clear(); @@ -469,10 +477,10 @@ TEST_F(DeleteFileTest, DeleteLogFiles) { // Call Flush again to flush out memtable and move alive log to archived log // and try to delete the archived log file FlushOptions fopts; - db_->Flush(fopts); + ASSERT_OK(db_->Flush(fopts)); AddKeys(10, 0); - db_->Flush(fopts); - db_->GetSortedWalFiles(logfiles); + ASSERT_OK(db_->Flush(fopts)); + ASSERT_OK(db_->GetSortedWalFiles(logfiles)); ASSERT_GT(logfiles.size(), 0UL); std::unique_ptr archived_log = std::move(logfiles.front()); ASSERT_EQ(archived_log->Type(), kArchivedLogFile); @@ -480,8 +488,8 @@ TEST_F(DeleteFileTest, DeleteLogFiles) { fprintf(stdout, "Deleting archived log file %s\n", archived_log->PathName().c_str()); ASSERT_OK(db_->DeleteFile(archived_log->PathName())); - ASSERT_EQ(Status::NotFound(), - env_->FileExists(wal_dir_ + "/" + archived_log->PathName())); + ASSERT_TRUE( + env_->FileExists(wal_dir_ + "/" + archived_log->PathName()).IsNotFound()); } TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { @@ -520,6 +528,7 @@ TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { { std::unique_ptr itr(db_->NewIterator(ReadOptions(), handles_[1])); + ASSERT_OK(itr->status()); int count = 0; for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { ASSERT_OK(itr->status()); diff --git a/db/error_handler.cc b/db/error_handler.cc index 7aa4aa82689..b5c353a6908 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -4,9 +4,11 @@ // (found in the LICENSE.Apache file in the root directory). // #include "db/error_handler.h" + #include "db/db_impl/db_impl.h" #include "db/event_helpers.h" #include "file/sst_file_manager_impl.h" +#include "logging/logging.h" namespace ROCKSDB_NAMESPACE { @@ -111,6 +113,23 @@ std::map, Status::Code::kIOError, Status::SubCode::kIOFenced, false), Status::Severity::kFatalError}, + // Errors during MANIFEST write when WAL is disabled + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, Status::SubCode::kIOFenced, + false), + Status::Severity::kFatalError}, }; @@ -175,6 +194,12 @@ std::map, {std::make_tuple(BackgroundErrorReason::kFlushNoWAL, Status::Code::kIOError, false), Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL, + Status::Code::kIOError, false), + Status::Severity::kFatalError}, }; std::map, Status::Severity> @@ -244,12 +269,19 @@ void ErrorHandler::CancelErrorRecovery() { // This can also get called as part of a recovery operation. 
In that case, we // also track the error separately in recovery_error_ so we can tell in the // end whether recovery succeeded or not -Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reason) { +const Status& ErrorHandler::SetBGError(const Status& bg_err, + BackgroundErrorReason reason) { db_mutex_->AssertHeld(); if (bg_err.ok()) { - return Status::OK(); + return bg_err; } + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + } + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: Set regular background error\n"); + bool paranoid = db_options_.paranoid_checks; Status::Severity sev = Status::Severity::kFatalError; Status new_bg_err; @@ -324,11 +356,32 @@ Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReas return bg_error_; } -Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, - BackgroundErrorReason reason) { +// This is the main function for looking at IO-related errors during +// background operations. The main logic is: +// 1) File scope IO error is treated as retryable IO error in the write +// path. In RocksDB, if a file has a write IO error and it is at file scope, +// RocksDB never writes to the same file again. RocksDB will create a new +// file and rewrite the whole content. Thus, it is retryable. +// 2) if the error is caused by data loss, the error is mapped to +// unrecoverable error. Application/user must take action to handle +// this situation (the file scope case is excluded). +// 3) if the error is a Retryable IO error (i.e., it is a file scope IO error, +// or its retryable flag is set and it is not a data loss error), auto resume +// will be called and the auto resume can be controlled by the resume count +// and resume interval options. There are three sub-cases: +// a) if the error happens during compaction, it is mapped to a soft error. +// The compaction thread will reschedule a new compaction. +// b) if the error happens during flush and the WAL is empty, it is mapped +// to a soft error. Note that this includes the case that the IO error happens +// in SST or manifest write during flush. +// c) all other errors are mapped to hard error. +// 4) for other cases, SetBGError(const Status& bg_err, BackgroundErrorReason +// reason) will be called to handle other error cases. +const Status& ErrorHandler::SetBGError(const IOStatus& bg_io_err, + BackgroundErrorReason reason) { db_mutex_->AssertHeld(); if (bg_io_err.ok()) { - return Status::OK(); + return bg_io_err; } ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s", bg_io_err.ToString().c_str()); @@ -336,45 +389,74 @@ Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, if (recovery_in_prog_ && recovery_io_error_.ok()) { recovery_io_error_ = bg_io_err; } - if (BackgroundErrorReason::kManifestWrite == reason) { + if (BackgroundErrorReason::kManifestWrite == reason || + BackgroundErrorReason::kManifestWriteNoWAL == reason) { // Always returns ok db_->DisableFileDeletionsWithLock().PermitUncheckedError(); } Status new_bg_io_err = bg_io_err; - Status s; DBRecoverContext context; - if (bg_io_err.GetDataLoss()) { - // First, data loss is treated as unrecoverable error. So it can directly - // overwrite any existing bg_error_. + if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile && + bg_io_err.GetDataLoss()) { + // First, data loss (non file scope) is treated as unrecoverable error. So + // it can directly overwrite any existing bg_error_. 
bool auto_recovery = false; Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError); bg_error_ = bg_err; if (recovery_in_prog_ && recovery_error_.ok()) { recovery_error_ = bg_err; } - EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, - db_mutex_, &auto_recovery); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + } + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Set background IO error as unrecoverable error\n"); + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, + &bg_err, db_mutex_, &auto_recovery); recover_context_ = context; return bg_error_; - } else if (bg_io_err.GetRetryable()) { - // Second, check if the error is a retryable IO error or not. if it is - // retryable error and its severity is higher than bg_error_, overwrite - // the bg_error_ with new error. - // In current stage, for retryable IO error of compaction, treat it as - // soft error. In other cases, treat the retryable IO error as hard - // error. + } else if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace && + (bg_io_err.GetScope() == + IOStatus::IOErrorScope::kIOErrorScopeFile || + bg_io_err.GetRetryable())) { + // Second, check if the error is a retryable IO error (file scope IO error + // is also treated as retryable IO error in RocksDB write path). if it is + // retryable error and its severity is higher than bg_error_, overwrite the + // bg_error_ with new error. In current stage, for retryable IO error of + // compaction, treat it as soft error. In other cases, treat the retryable + // IO error as hard error. Note that, all the NoSpace error should be + // handled by the SstFileManager::StartErrorRecovery(). Therefore, no matter + // it is retryable or file scope, this logic will be bypassed. bool auto_recovery = false; - EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, - db_mutex_, &auto_recovery); + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, + &new_bg_io_err, db_mutex_, + &auto_recovery); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT); + } + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: Set background retryable IO error\n"); if (BackgroundErrorReason::kCompaction == reason) { - Status bg_err(new_bg_io_err, Status::Severity::kSoftError); - if (bg_err.severity() > bg_error_.severity()) { - bg_error_ = bg_err; + // We map the retryable IO error during compaction to soft error. Since + // compaction can reschedule by itself. We will not set the BG error in + // this case + // TODO: a better way to set or clean the retryable IO error which + // happens during compaction SST file write. + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT); } - recover_context_ = context; + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Compaction will schedule by itself to resume\n"); return bg_error_; - } else if (BackgroundErrorReason::kFlushNoWAL == reason) { + } else if (BackgroundErrorReason::kFlushNoWAL == reason || + BackgroundErrorReason::kManifestWriteNoWAL == reason) { // When the BG Retryable IO error reason is flush without WAL, // We map it to a soft error. 
At the same time, all the background work // should be stopped except the BG work from recovery. Therefore, we @@ -405,12 +487,14 @@ Status ErrorHandler::SetBGError(const IOStatus& bg_io_err, return StartRecoverFromRetryableBGIOError(bg_io_err); } } else { - s = SetBGError(new_bg_io_err, reason); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + } + return SetBGError(new_bg_io_err, reason); } - return s; } -Status ErrorHandler::OverrideNoSpaceError(Status bg_error, +Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery) { #ifndef ROCKSDB_LITE if (bg_error.severity() >= Status::Severity::kFatalError) { @@ -466,7 +550,11 @@ Status ErrorHandler::ClearBGError() { // Signal that recovery succeeded if (recovery_error_.ok()) { Status old_bg_error = bg_error_; + // Clear and check the recovery IO and BG error bg_error_ = Status::OK(); + recovery_io_error_ = IOStatus::OK(); + bg_error_.PermitUncheckedError(); + recovery_io_error_.PermitUncheckedError(); recovery_in_prog_ = false; soft_error_no_bg_work_ = false; EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, @@ -516,6 +604,7 @@ Status ErrorHandler::RecoverFromBGError(bool is_manual) { // during the recovery process. While recovering, the only operations that // can generate background errors should be the flush operations recovery_error_ = Status::OK(); + recovery_error_.PermitUncheckedError(); Status s = db_->ResumeImpl(recover_context_); if (s.ok()) { soft_error_no_bg_work_ = false; @@ -537,24 +626,39 @@ Status ErrorHandler::RecoverFromBGError(bool is_manual) { #endif } -Status ErrorHandler::StartRecoverFromRetryableBGIOError(IOStatus io_error) { +const Status& ErrorHandler::StartRecoverFromRetryableBGIOError( + const IOStatus& io_error) { #ifndef ROCKSDB_LITE db_mutex_->AssertHeld(); - if (bg_error_.ok() || io_error.ok()) { - return Status::OK(); - } - if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_ || - recovery_thread_) { + if (bg_error_.ok()) { + return bg_error_; + } else if (io_error.ok()) { + return io_error; + } else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) { // Auto resume BG error is not enabled, directly return bg_error_. return bg_error_; } + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT); + } + ROCKS_LOG_INFO( + db_options_.info_log, + "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n"); + if (recovery_thread_) { + // In this case, if recovery_in_prog_ is false, current thread should + // wait the previous recover thread to finish and create a new thread + // to recover from the bg error. + db_mutex_->Unlock(); + recovery_thread_->join(); + db_mutex_->Lock(); + } recovery_in_prog_ = true; recovery_thread_.reset( new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this)); if (recovery_io_error_.ok() && recovery_error_.ok()) { - return Status::OK(); + return recovery_error_; } else { TEST_SYNC_POINT("StartRecoverRetryableBGIOError:RecoverFail"); return bg_error_; @@ -578,6 +682,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { DBRecoverContext context = recover_context_; int resume_count = db_options_.max_bgerror_resume_count; uint64_t wait_interval = db_options_.bgerror_resume_retry_interval; + uint64_t retry_count = 0; // Recover from the retryable error. Create a separate thread to do it. 
while (resume_count > 0) { if (end_recovery_) { @@ -587,15 +692,24 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1"); recovery_io_error_ = IOStatus::OK(); recovery_error_ = Status::OK(); + retry_count++; Status s = db_->ResumeImpl(context); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume0"); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterResume1"); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT); + } if (s.IsShutdownInProgress() || bg_error_.severity() >= Status::Severity::kFatalError) { // If DB shutdown in progress or the error severity is higher than // Hard Error, stop auto resume and returns. TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverFail0"); recovery_in_prog_ = false; + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } return; } if (!recovery_io_error_.ok() && @@ -606,7 +720,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { // a period of time and redo auto resume if it is allowed. TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0"); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1"); - int64_t wait_until = db_->env_->NowMicros() + wait_interval; + int64_t wait_until = db_options_.clock->NowMicros() + wait_interval; cv_.TimedWait(wait_until); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:AfterWait0"); } else { @@ -619,8 +733,15 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess"); Status old_bg_error = bg_error_; bg_error_ = Status::OK(); + bg_error_.PermitUncheckedError(); EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, old_bg_error, db_mutex_); + if (bg_error_stats_ != nullptr) { + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT); + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } recovery_in_prog_ = false; if (soft_error_no_bg_work_) { soft_error_no_bg_work_ = false; @@ -631,6 +752,10 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { // In this case: 1) recovery_io_error is more serious or not retryable // 2) other Non IO recovery_error happens. The auto recovery stops. 
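// A sketch of the DBOptions knobs that drive this retry loop (values are
// illustrative; the same settings appear in the error handler tests below):
//   Options options;
//   options.max_bgerror_resume_count = 2;            // at most two resume attempts
//   options.bgerror_resume_retry_interval = 100000;  // wait ~0.1s between attempts
//   options.statistics = CreateDBStatistics();       // records ERROR_HANDLER_* tickers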
recovery_in_prog_ = false; + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } return; } } @@ -638,6 +763,10 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { } recovery_in_prog_ = false; TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut"); + if (bg_error_stats_ != nullptr) { + RecordInHistogram(bg_error_stats_.get(), + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); + } return; #else return; diff --git a/db/error_handler.h b/db/error_handler.h index 084434101aa..ab1169bc907 100644 --- a/db/error_handler.h +++ b/db/error_handler.h @@ -31,17 +31,15 @@ class ErrorHandler { InstrumentedMutex* db_mutex) : db_(db), db_options_(db_options), - bg_error_(Status::OK()), - recovery_error_(Status::OK()), - recovery_io_error_(IOStatus::OK()), cv_(db_mutex), end_recovery_(false), recovery_thread_(nullptr), db_mutex_(db_mutex), auto_recovery_(false), recovery_in_prog_(false), - soft_error_no_bg_work_(false) {} - ~ErrorHandler() { + soft_error_no_bg_work_(false), + bg_error_stats_(db_options.statistics) { + // Clear the checked flag for uninitialized errors bg_error_.PermitUncheckedError(); recovery_error_.PermitUncheckedError(); recovery_io_error_.PermitUncheckedError(); @@ -53,13 +51,14 @@ class ErrorHandler { Status::Code code, Status::SubCode subcode); - Status SetBGError(const Status& bg_err, BackgroundErrorReason reason); + const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason); - Status SetBGError(const IOStatus& bg_io_err, BackgroundErrorReason reason); + const Status& SetBGError(const IOStatus& bg_io_err, + BackgroundErrorReason reason); - Status GetBGError() { return bg_error_; } + Status GetBGError() const { return bg_error_; } - Status GetRecoveryError() { return recovery_error_; } + Status GetRecoveryError() const { return recovery_error_; } Status ClearBGError(); @@ -104,15 +103,18 @@ class ErrorHandler { bool auto_recovery_; bool recovery_in_prog_; // A flag to indicate that for the soft error, we should not allow any - // backrgound work execpt the work is from recovery. + // background work except the work is from recovery. bool soft_error_no_bg_work_; // Used to store the context for recover, such as flush reason. DBRecoverContext recover_context_; - Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery); + // The pointer of DB statistics. 
+ std::shared_ptr bg_error_stats_; + + Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery); void RecoverFromNoSpace(); - Status StartRecoverFromRetryableBGIOError(IOStatus io_error); + const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error); void RecoverFromRetryableBGIOError(); }; diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc index 895c878ab6e..e0456a82737 100644 --- a/db/error_handler_fs_test.cc +++ b/db/error_handler_fs_test.cc @@ -9,9 +9,9 @@ #ifndef ROCKSDB_LITE #include "db/db_test_util.h" +#include "file/sst_file_manager_impl.h" #include "port/stack_trace.h" #include "rocksdb/io_status.h" -#include "rocksdb/perf_context.h" #include "rocksdb/sst_file_manager.h" #if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" @@ -25,7 +25,10 @@ namespace ROCKSDB_NAMESPACE { class DBErrorHandlingFSTest : public DBTestBase { public: DBErrorHandlingFSTest() - : DBTestBase("/db_error_handling_fs_test", /*env_do_fsync=*/true) {} + : DBTestBase("/db_error_handling_fs_test", /*env_do_fsync=*/true) { + fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem())); + fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_)); + } std::string GetManifestNameFromLiveFiles() { std::vector live_files; @@ -44,21 +47,9 @@ class DBErrorHandlingFSTest : public DBTestBase { } return ""; } -}; - -class DBErrorHandlingFS : public FileSystemWrapper { - public: - DBErrorHandlingFS() - : FileSystemWrapper(FileSystem::Default()), - trig_no_space(false), - trig_io_error(false) {} - - void SetTrigNoSpace() { trig_no_space = true; } - void SetTrigIoError() { trig_io_error = true; } - private: - bool trig_no_space; - bool trig_io_error; + std::shared_ptr fault_fs_; + std::unique_ptr fault_env_; }; class ErrorHandlerFSListener : public EventListener { @@ -161,15 +152,13 @@ class ErrorHandlerFSListener : public EventListener { }; TEST_F(DBErrorHandlingFSTest, FLushWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); Status s; listener->EnableAutoRecovery(false); @@ -177,32 +166,89 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteError) { ASSERT_OK(Put(Key(0), "val")); SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + 
ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); Reopen(options); ASSERT_EQ("val", Get(Key(0))); Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); +// All the NoSpace IOError will be handled as the regular BG Error no matter the +// retryable flag is set of not. So the auto resume for retryable IO Error will +// not be triggered. Also, it is mapped as hard error. +TEST_F(DBErrorHandlingFSTest, FLushWriteNoSpaceError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 2; + options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::NoSpace("Retryable IO Error"); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) { std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 0; + options.statistics = CreateDBStatistics(); Status s; listener->EnableAutoRecovery(false); @@ -214,59 +260,242 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableError) { ASSERT_OK(Put(Key(1), "val1")); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + 
ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); Reopen(options); ASSERT_EQ("val1", Get(Key(1))); ASSERT_OK(Put(Key(2), "val2")); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeSyncTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); Reopen(options); ASSERT_EQ("val2", Get(Key(2))); ASSERT_OK(Put(Key(3), "val3")); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeCloseTableFile", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); Reopen(options); ASSERT_EQ("val3", Get(Key(3))); Destroy(options); } +TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(1), "val1")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val1", Get(Key(1))); + + ASSERT_OK(Put(Key(2), "val2")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeSyncTable", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = 
dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val2", Get(Key(2))); + + ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + // not file scope, but retyrable set + error_msg.SetDataLoss(false); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFileSystem); + error_msg.SetRetryable(true); + + ASSERT_OK(Put(Key(3), "val3")); + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeCloseTableFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + Reopen(options); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWALWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + listener->EnableAutoRecovery(false); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:Start", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = false; + ASSERT_OK(Put(Key(1), "val1", wo)); + + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + auto cfh = dbfull()->GetColumnFamilyHandle(1); + s = dbfull()->DropColumnFamily(cfh); + + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + options.atomic_flush = true; + Status s; + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + listener->EnableAutoRecovery(false); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::SyncClosedLogs:Start", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = false; + ASSERT_OK(Put(Key(1), "val1", wo)); + + s = Flush(); + 
ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + auto cfh = dbfull()->GetColumnFamilyHandle(1); + s = dbfull()->DropColumnFamily(cfh); + + s = dbfull()->Resume(); + ASSERT_OK(s); + ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(Put(Key(3), "val3", wo)); + ASSERT_EQ("val3", Get(Key(3))); + s = Flush(); + ASSERT_OK(s); + ASSERT_EQ("val3", Get(Key(3))); + + Destroy(options); +} + TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 0; + options.statistics = CreateDBStatistics(); Status s; listener->EnableAutoRecovery(false); @@ -280,16 +509,16 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { ASSERT_OK(Put(Key(1), "val1", wo)); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_OK(Put(Key(2), "val2", wo)); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); ASSERT_EQ("val2", Get(Key(2))); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_EQ("val1", Get(Key(1))); ASSERT_EQ("val2", Get(Key(2))); ASSERT_OK(Put(Key(3), "val3", wo)); @@ -297,18 +526,27 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { s = Flush(); ASSERT_OK(s); ASSERT_EQ("val3", Get(Key(3))); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError2) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); +TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) { std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 0; @@ -326,16 +564,16 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError2) { ASSERT_OK(Put(Key(1), "val1", wo)); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeSyncTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); 
SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_OK(Put(Key(2), "val2", wo)); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); ASSERT_EQ("val2", Get(Key(2))); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_EQ("val1", Get(Key(1))); ASSERT_EQ("val2", Get(Key(2))); ASSERT_OK(Put(Key(3), "val3", wo)); @@ -347,14 +585,11 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError2) { Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError3) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); +TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) { std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 0; @@ -372,16 +607,16 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError3) { ASSERT_OK(Put(Key(1), "val1", wo)); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeCloseTableFile", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_OK(Put(Key(2), "val2", wo)); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); ASSERT_EQ("val2", Get(Key(2))); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ASSERT_EQ("val1", Get(Key(1))); ASSERT_EQ("val2", Get(Key(2))); ASSERT_OK(Put(Key(3), "val3", wo)); @@ -394,13 +629,10 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError3) { } TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); Status s; @@ -416,16 +648,17 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { ASSERT_OK(Put(Key(1), "val")); SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); @@ -437,13 +670,10 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { } TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - 
std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 0; @@ -463,15 +693,106 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { ASSERT_OK(Put(Key(1), "val")); SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + fault_fs_->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_OK(s); + + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); +} + +TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + std::string old_manifest; + std::string new_manifest; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(0), "val", wo)); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val", wo)); + SyncPoint::GetInstance()->SetCallBack( + 
"VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); @@ -483,13 +804,10 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { } TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); Status s; @@ -505,23 +823,24 @@ TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) { ASSERT_OK(Put(Key(1), "val")); SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); // This Resume() will attempt to create a new manifest file and fail again s = dbfull()->Resume(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); // A successful Resume() will create a new manifest file s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); @@ -533,13 +852,14 @@ TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) { } TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -553,7 +873,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { ASSERT_OK(Put(Key(0), "val")); ASSERT_OK(Put(Key(2), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( // Wait for flush of 2nd L0 file before starting compaction @@ -573,8 +893,8 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { if (fail_manifest.load()) { - fault_fs->SetFilesystemActive(false, - IOStatus::NoSpace("Out of space")); + 
fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -583,18 +903,18 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { // This Flush will trigger a compaction, which will fail when appending to // the manifest s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); TEST_SYNC_POINT("CompactionManifestWriteError:0"); // Clear all errors so when the compaction is retried, it will succeed - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); TEST_SYNC_POINT("CompactionManifestWriteError:1"); TEST_SYNC_POINT("CompactionManifestWriteError:2"); s = dbfull()->TEST_WaitForCompact(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); @@ -606,13 +926,10 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { } TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -630,7 +947,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { ASSERT_OK(Put(Key(0), "val")); ASSERT_OK(Put(Key(2), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); listener->EnableAutoRecovery(false); @@ -649,14 +966,14 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { if (fail_manifest.load()) { - fault_fs->SetFilesystemActive(false, error_msg); + fault_fs_->SetFilesystemActive(false, error_msg); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); TEST_SYNC_POINT("CompactionManifestWriteError:0"); TEST_SYNC_POINT("CompactionManifestWriteError:1"); @@ -664,11 +981,11 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { s = dbfull()->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); @@ -681,13 +998,10 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { } TEST_F(DBErrorHandlingFSTest, CompactionWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; 
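The compaction and manifest tests above choreograph background work and fault injection through RocksDB's SyncPoint facility. A minimal sketch of that pattern, assuming a test (non-NDEBUG) build where SyncPoint is compiled in; the sync-point names, `fault_fs_`, and `error_msg` are the ones used by the tests above.

using ROCKSDB_NAMESPACE::SyncPoint;

// Block the compaction thread until the second flush has finished.
SyncPoint::GetInstance()->LoadDependency(
    {{"DBImpl::FlushMemTable:FlushMemTableFinished",
      "BackgroundCallCompaction:0"}});
// Fail the filesystem exactly when the MANIFEST write is attempted.
SyncPoint::GetInstance()->SetCallBack(
    "VersionSet::LogAndApply:WriteManifest",
    [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
SyncPoint::GetInstance()->EnableProcessing();
// ... issue Put()/Flush() so the flush and the dependent compaction run ...
SyncPoint::GetInstance()->ClearAllCallBacks();
SyncPoint::GetInstance()->DisableProcessing();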
options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -697,7 +1011,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteError) { ASSERT_OK(Put(Key(0), "va;")); ASSERT_OK(Put(Key(2), "va;")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); listener->OverrideBGError( Status(Status::NoSpace(), Status::Severity::kHardError)); @@ -707,31 +1021,29 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteError) { "BackgroundCallCompaction:0"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BackgroundCallCompaction:0", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); Destroy(options); } -TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); +TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteRetryableError) { std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -745,7 +1057,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableError) { ASSERT_OK(Put(Key(0), "va;")); ASSERT_OK(Put(Key(2), "va;")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); listener->EnableAutoRecovery(false); @@ -754,30 +1066,80 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableError) { "BackgroundCallCompaction:0"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::OpenCompactionOutputFile", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Finish", + [&](void*) { CancelAllBackgroundWork(dbfull()); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); - s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + fault_fs_->SetFilesystemActive(true); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + s = dbfull()->Resume(); + ASSERT_OK(s); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteFileScopeError) { + std::shared_ptr listener( + new ErrorHandlerFSListener()); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 0; + Status s; + DestroyAndReopen(options); + + IOStatus error_msg = 
IOStatus::IOError("File Scope Data Loss Error"); + error_msg.SetDataLoss(true); + error_msg.SetScope( + ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile); + error_msg.SetRetryable(false); + + ASSERT_OK(Put(Key(0), "va;")); + ASSERT_OK(Put(Key(2), "va;")); + s = Flush(); + ASSERT_OK(s); + + listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); + listener->EnableAutoRecovery(false); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:FlushMemTableFinished", + "BackgroundCallCompaction:0"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Finish", + [&](void*) { CancelAllBackgroundWork(dbfull()); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val")); + s = Flush(); + ASSERT_OK(s); - fault_fs->SetFilesystemActive(true); + s = dbfull()->TEST_GetBGError(); + ASSERT_OK(s); + + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); Destroy(options); } TEST_F(DBErrorHandlingFSTest, CorruptionError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; Status s; @@ -786,42 +1148,44 @@ TEST_F(DBErrorHandlingFSTest, CorruptionError) { ASSERT_OK(Put(Key(0), "va;")); ASSERT_OK(Put(Key(2), "va;")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::FlushMemTable:FlushMemTableFinished", "BackgroundCallCompaction:0"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BackgroundCallCompaction:0", [&](void*) { - fault_fs->SetFilesystemActive(false, - IOStatus::Corruption("Corruption")); + fault_fs_->SetFilesystemActive(false, + IOStatus::Corruption("Corruption")); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); - ASSERT_NE(s, Status::OK()); + ASSERT_NOK(s); Destroy(options); } TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); + options.statistics = CreateDBStatistics(); Status s; listener->EnableAutoRecovery(); @@ -829,17 +1193,29 @@ TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) { ASSERT_OK(Put(Key(0), "val")); 
SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); ASSERT_EQ(listener->WaitForRecovery(5000000), true); s = Put(Key(1), "val"); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_EQ(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); Reopen(options); ASSERT_EQ("val", Get(Key(0))); @@ -848,13 +1224,10 @@ TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) { } TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); Status s; @@ -864,7 +1237,7 @@ TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) { ASSERT_OK(Put(Key(0), "val")); SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); @@ -872,17 +1245,18 @@ TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) { // We should be able to shutdown the database while auto recovery is going // on in the background Close(); - DestroyDB(dbname_, options); + DestroyDB(dbname_, options).PermitUncheckedError(); } TEST_F(DBErrorHandlingFSTest, WALWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.writable_file_max_buffer_size = 32768; options.listeners.emplace_back(listener); @@ -901,7 +1275,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteError) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; { @@ -916,8 +1290,8 @@ TEST_F(DBErrorHandlingFSTest, WALWriteError) { "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { write_error++; if (write_error > 2) { - fault_fs->SetFilesystemActive(false, - IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of 
space")); } }); SyncPoint::GetInstance()->EnableProcessing(); @@ -927,7 +1301,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteError) { ASSERT_EQ(s, s.NoSpace()); } SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); ASSERT_EQ(listener->WaitForRecovery(5000000), true); for (auto i = 0; i < 199; ++i) { if (i < 100) { @@ -948,19 +1322,15 @@ TEST_F(DBErrorHandlingFSTest, WALWriteError) { } TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.writable_file_max_buffer_size = 32768; options.listeners.emplace_back(listener); options.paranoid_checks = true; options.max_bgerror_resume_count = 0; - Status s; Random rnd(301); DestroyAndReopen(options); @@ -978,7 +1348,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; // For the second batch, the first 2 file Append are successful, then the @@ -995,16 +1365,16 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { write_error++; if (write_error > 2) { - fault_fs->SetFilesystemActive(false, error_msg); + fault_fs_->SetFilesystemActive(false, error_msg); } }); SyncPoint::GetInstance()->EnableProcessing(); WriteOptions wopts; wopts.sync = true; - s = dbfull()->Write(wopts, &batch); - ASSERT_EQ(true, s.IsIOError()); + Status s = dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsIOError()); } - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); @@ -1018,8 +1388,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { } // Resume and write a new batch, should be in the WAL - s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(dbfull()->Resume()); { WriteBatch batch; @@ -1029,7 +1398,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; Reopen(options); @@ -1044,17 +1413,17 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { } TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.writable_file_max_buffer_size = 32768; options.listeners.emplace_back(listener); - Status s; Random rnd(301); listener->EnableAutoRecovery(); @@ -1071,7 +1440,7 @@ TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; { @@ -1087,18 +1456,18 @@ TEST_F(DBErrorHandlingFSTest, 
MultiCFWALWriteError) { "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { write_error++; if (write_error > 2) { - fault_fs->SetFilesystemActive(false, - IOStatus::NoSpace("Out of space")); + fault_fs_->SetFilesystemActive(false, + IOStatus::NoSpace("Out of space")); } }); SyncPoint::GetInstance()->EnableProcessing(); WriteOptions wopts; wopts.sync = true; - s = dbfull()->Write(wopts, &batch); - ASSERT_EQ(s, s.NoSpace()); + Status s = dbfull()->Write(wopts, &batch); + ASSERT_TRUE(s.IsNoSpace()); } SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); ASSERT_EQ(listener->WaitForRecovery(5000000), true); for (auto i = 1; i < 4; ++i) { @@ -1129,7 +1498,11 @@ TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) { } TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { - FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_); std::vector> fault_envs; std::vector fault_fs; std::vector options; @@ -1142,7 +1515,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { for (auto i = 0; i < kNumDbInstances; ++i) { listener.emplace_back(new ErrorHandlerFSListener()); options.emplace_back(GetDefaultOptions()); - fault_fs.emplace_back(new FaultInjectionTestFS(FileSystem::Default())); + fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem())); std::shared_ptr fs(fault_fs.back()); fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs)); options[i].env = fault_envs.back().get(); @@ -1159,9 +1532,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { listener[i]->InjectFileCreationError(fault_fs[i], 3, IOStatus::NoSpace("Out of space")); snprintf(buf, sizeof(buf), "_%d", i); - DestroyDB(dbname_ + std::string(buf), options[i]); - ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), - Status::OK()); + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr)); db.emplace_back(dbptr); } @@ -1174,8 +1546,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); } def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); @@ -1189,8 +1561,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); } for (auto i = 0; i < kNumDbInstances; ++i) { @@ -1203,8 +1575,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { for (auto i = 0; i < kNumDbInstances; ++i) { std::string prop; ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); - ASSERT_EQ(static_cast(db[i])->TEST_WaitForCompact(true), - Status::OK()); + ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact(true)); EXPECT_TRUE(db[i]->GetProperty( "rocksdb.num-files-at-level" + NumberToString(0), &prop)); EXPECT_EQ(atoi(prop.c_str()), 0); @@ -1213,6 +1584,10 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { EXPECT_EQ(atoi(prop.c_str()), 1); } + SstFileManagerImpl* sfmImpl = + 
static_cast_with_check(sfm.get()); + sfmImpl->Close(); + for (auto i = 0; i < kNumDbInstances; ++i) { char buf[16]; snprintf(buf, sizeof(buf), "_%d", i); @@ -1221,7 +1596,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { if (getenv("KEEP_DB")) { printf("DB is still at %s%s\n", dbname_.c_str(), buf); } else { - Status s = DestroyDB(dbname_ + std::string(buf), options[i]); + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); } } options.clear(); @@ -1230,7 +1605,11 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { } TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { - FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); + if (mem_env_ != nullptr) { + ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); + return; + } + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_); std::vector> fault_envs; std::vector fault_fs; std::vector options; @@ -1243,7 +1622,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { for (auto i = 0; i < kNumDbInstances; ++i) { listener.emplace_back(new ErrorHandlerFSListener()); options.emplace_back(GetDefaultOptions()); - fault_fs.emplace_back(new FaultInjectionTestFS(FileSystem::Default())); + fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem())); std::shared_ptr fs(fault_fs.back()); fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs)); options[i].env = fault_envs.back().get(); @@ -1272,9 +1651,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { break; } snprintf(buf, sizeof(buf), "_%d", i); - DestroyDB(dbname_ + std::string(buf), options[i]); - ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), - Status::OK()); + ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); + ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr)); db.emplace_back(dbptr); } @@ -1287,8 +1665,8 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + ASSERT_OK(db[i]->Write(wopts, &batch)); + ASSERT_OK(db[i]->Flush(FlushOptions())); } def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); @@ -1302,11 +1680,11 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + ASSERT_OK(db[i]->Write(wopts, &batch)); if (i != 1) { - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + ASSERT_OK(db[i]->Flush(FlushOptions())); } else { - ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::NoSpace()); + ASSERT_TRUE(db[i]->Flush(FlushOptions()).IsNoSpace()); } } @@ -1320,7 +1698,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { ASSERT_EQ(s.severity(), Status::Severity::kHardError); break; case 2: - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); break; } fault_fs[i]->SetFilesystemActive(true); @@ -1333,8 +1711,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); } if (i == 1) { - ASSERT_EQ(static_cast(db[i])->TEST_WaitForCompact(true), - Status::OK()); + ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact(true)); } EXPECT_TRUE(db[i]->GetProperty( "rocksdb.num-files-at-level" + NumberToString(0), &prop)); @@ -1344,6 +1721,10 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { EXPECT_EQ(atoi(prop.c_str()), 1); } + SstFileManagerImpl* sfmImpl = + static_cast_with_check(sfm.get()); + sfmImpl->Close(); + for (auto i = 0; i < 
kNumDbInstances; ++i) { char buf[16]; snprintf(buf, sizeof(buf), "_%d", i); @@ -1352,7 +1733,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { if (getenv("KEEP_DB")) { printf("DB is still at %s%s\n", dbname_.c_str(), buf); } else { - DestroyDB(dbname_ + std::string(buf), options[i]); + EXPECT_OK(DestroyDB(dbname_ + std::string(buf), options[i])); } } options.clear(); @@ -1364,19 +1745,17 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { // to soft error and trigger auto resume. During auto resume, SwitchMemtable // is disabled to avoid small SST tables. Write can still be applied before // the bg error is cleaned unless the memtable is full. -TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) { +TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) { // Activate the FS before the first resume - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 2; options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); Status s; listener->EnableAutoRecovery(false); @@ -1393,7 +1772,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) { "FLushWritNoWALRetryableeErrorAutoRecover1:1"}}); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); @@ -1403,7 +1782,23 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) { ASSERT_EQ("val1", Get(Key(1))); ASSERT_EQ("val1", Get(Key(1))); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); + ASSERT_EQ(3, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(3, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(3, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + HistogramData autoresume_retry; + options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + &autoresume_retry); + ASSERT_GE(autoresume_retry.max, 0); ASSERT_OK(Put(Key(2), "val2", wo)); s = Flush(); // Since auto resume fails, the bg error is not cleand, flush will @@ -1412,29 +1807,25 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover1) { ASSERT_EQ("val2", Get(Key(2))); // call auto resume - s = dbfull()->Resume(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(dbfull()->Resume()); ASSERT_OK(Put(Key(3), "val3", wo)); - s = Flush(); // After resume is successful, the flush should be ok. 
- ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Flush()); ASSERT_EQ("val3", Get(Key(3))); Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) { +TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) { // Activate the FS before the first resume - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 2; options.bgerror_resume_retry_interval = 100000; // 0.1 second + options.statistics = CreateDBStatistics(); Status s; listener->EnableAutoRecovery(false); @@ -1448,16 +1839,32 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) { ASSERT_OK(Put(Key(1), "val1", wo)); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ("val1", Get(Key(1))); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); ASSERT_EQ(listener->WaitForRecovery(5000000), true); ASSERT_EQ("val1", Get(Key(1))); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT)); + ASSERT_EQ(1, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT)); + ASSERT_LE(0, options.statistics->getAndResetTickerCount( + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT)); + HistogramData autoresume_retry; + options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + &autoresume_retry); + ASSERT_GE(autoresume_retry.max, 0); ASSERT_OK(Put(Key(2), "val2", wo)); s = Flush(); // Since auto resume is successful, the bg error is cleaned, flush will @@ -1467,68 +1874,12 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableeErrorAutoRecover2) { Destroy(options); } -TEST_F(DBErrorHandlingFSTest, DISABLED_FLushWritRetryableeErrorAutoRecover1) { - // Fail the first resume and make the second resume successful - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); - std::shared_ptr listener( - new ErrorHandlerFSListener()); - Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); - options.create_if_missing = true; - options.listeners.emplace_back(listener); - options.max_bgerror_resume_count = 2; - options.bgerror_resume_retry_interval = 100000; // 0.1 second - Status s; - - listener->EnableAutoRecovery(false); - DestroyAndReopen(options); - - IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); - error_msg.SetRetryable(true); - - ASSERT_OK(Put(Key(1), "val1")); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"RecoverFromRetryableBGIOError:BeforeWait0", - 
"FLushWritRetryableeErrorAutoRecover1:0"}, - {"FLushWritRetryableeErrorAutoRecover1:1", - "RecoverFromRetryableBGIOError:BeforeWait1"}, - {"RecoverFromRetryableBGIOError:RecoverSuccess", - "FLushWritRetryableeErrorAutoRecover1:2"}}); - SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover1:0"); - fault_fs->SetFilesystemActive(true); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover1:1"); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover1:2"); - SyncPoint::GetInstance()->DisableProcessing(); - - ASSERT_EQ("val1", Get(Key(1))); - Reopen(options); - ASSERT_EQ("val1", Get(Key(1))); - ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - ASSERT_EQ("val2", Get(Key(2))); - - Destroy(options); -} - -TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover2) { +TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) { // Activate the FS before the first resume - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 2; @@ -1544,35 +1895,31 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover2) { ASSERT_OK(Put(Key(1), "val1")); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); ASSERT_EQ(listener->WaitForRecovery(5000000), true); ASSERT_EQ("val1", Get(Key(1))); Reopen(options); ASSERT_EQ("val1", Get(Key(1))); ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Flush()); ASSERT_EQ("val2", Get(Key(2))); Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover3) { +TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) { // Fail all the resume and let user to resume - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 2; @@ -1587,19 +1934,19 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover3) { ASSERT_OK(Put(Key(1), "val1")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"FLushWritRetryableeErrorAutoRecover3:0", + {{"FLushWritRetryableeErrorAutoRecover2:0", 
"RecoverFromRetryableBGIOError:BeforeStart"}, {"RecoverFromRetryableBGIOError:LoopOut", - "FLushWritRetryableeErrorAutoRecover3:1"}}); + "FLushWritRetryableeErrorAutoRecover2:1"}}); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover3:0"); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover3:1"); - fault_fs->SetFilesystemActive(true); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:0"); + TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:1"); + fault_fs_->SetFilesystemActive(true); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); @@ -1608,203 +1955,73 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover3) { // resume manually here. s = dbfull()->Resume(); ASSERT_EQ("val1", Get(Key(1))); - ASSERT_EQ(s, Status::OK()); - ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - ASSERT_EQ("val2", Get(Key(2))); - - Destroy(options); -} - -TEST_F(DBErrorHandlingFSTest, DISABLED_FLushWritRetryableeErrorAutoRecover4) { - // Fail the first resume and does not do resume second time because - // the IO error severity is Fatal Error and not Retryable. - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); - std::shared_ptr listener( - new ErrorHandlerFSListener()); - Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); - options.create_if_missing = true; - options.listeners.emplace_back(listener); - options.max_bgerror_resume_count = 2; - options.bgerror_resume_retry_interval = 10; // 0.1 second - Status s; - - listener->EnableAutoRecovery(false); - DestroyAndReopen(options); - - IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); - error_msg.SetRetryable(true); - IOStatus nr_msg = IOStatus::IOError("No Retryable Fatal IO Error"); - nr_msg.SetRetryable(false); - - ASSERT_OK(Put(Key(1), "val1")); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"RecoverFromRetryableBGIOError:BeforeStart", - "FLushWritRetryableeErrorAutoRecover4:0"}, - {"FLushWritRetryableeErrorAutoRecover4:2", - "RecoverFromRetryableBGIOError:RecoverFail0"}}); - SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); - SyncPoint::GetInstance()->SetCallBack( - "RecoverFromRetryableBGIOError:BeforeResume1", - [&](void*) { fault_fs->SetFilesystemActive(false, nr_msg); }); - - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover4:0"); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover4:2"); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); - // Even the FS is recoverd, due to the Fatal Error in bg_error_ the resume - // and flush will all fail. 
- ASSERT_EQ("val1", Get(Key(1))); - s = dbfull()->Resume(); - ASSERT_NE(s, Status::OK()); - ASSERT_EQ("val1", Get(Key(1))); - ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_NE(s, Status::OK()); - ASSERT_EQ("NOT_FOUND", Get(Key(2))); - - Reopen(options); - ASSERT_EQ("val1", Get(Key(1))); + ASSERT_OK(s); ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Flush()); ASSERT_EQ("val2", Get(Key(2))); Destroy(options); } -TEST_F(DBErrorHandlingFSTest, DISABLED_FLushWritRetryableeErrorAutoRecover5) { - // During the resume, call DB->CLose, make sure the resume thread exist - // before close continues. Due to the shutdown, the resume is not successful - // and the FS does not become active, so close status is still IO error - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); +TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { + // Fail the first resume and let the second resume be successful std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 2; - options.bgerror_resume_retry_interval = 10; // 0.1 second + options.bgerror_resume_retry_interval = 100000; // 0.1 second Status s; + std::string old_manifest; + std::string new_manifest; listener->EnableAutoRecovery(false); DestroyAndReopen(options); + old_manifest = GetManifestNameFromLiveFiles(); IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); error_msg.SetRetryable(true); - ASSERT_OK(Put(Key(1), "val1")); + ASSERT_OK(Put(Key(0), "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put(Key(1), "val")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"RecoverFromRetryableBGIOError:BeforeStart", - "FLushWritRetryableeErrorAutoRecover5:0"}}); - SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); - SyncPoint::GetInstance()->EnableProcessing(); - s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover5:0"); - // The first resume will cause recovery_error and its severity is the - // Fatal error - s = dbfull()->Close(); - ASSERT_NE(s, Status::OK()); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); - - Reopen(options); - ASSERT_NE("val1", Get(Key(1))); - ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - ASSERT_EQ("val2", Get(Key(2))); - - Destroy(options); -} - -TEST_F(DBErrorHandlingFSTest, FLushWritRetryableeErrorAutoRecover6) { - // During the resume, call DB->CLose, make sure the resume thread exist - // before close continues. 
Due to the shutdown, the resume is not successful - // and the FS does not become active, so close status is still IO error - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); - std::shared_ptr listener( - new ErrorHandlerFSListener()); - Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); - options.create_if_missing = true; - options.listeners.emplace_back(listener); - options.max_bgerror_resume_count = 2; - options.bgerror_resume_retry_interval = 10; // 0.1 second - Status s; - - listener->EnableAutoRecovery(false); - DestroyAndReopen(options); - - IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); - error_msg.SetRetryable(true); - - ASSERT_OK(Put(Key(1), "val1")); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"FLushWritRetryableeErrorAutoRecover6:0", - "RecoverFromRetryableBGIOError:BeforeStart"}, - {"RecoverFromRetryableBGIOError:BeforeWait0", - "FLushWritRetryableeErrorAutoRecover6:1"}, - {"FLushWritRetryableeErrorAutoRecover6:2", + "ManifestWriteRetryableErrorAutoRecover:0"}, + {"ManifestWriteRetryableErrorAutoRecover:1", "RecoverFromRetryableBGIOError:BeforeWait1"}, - {"RecoverFromRetryableBGIOError:AfterWait0", - "FLushWritRetryableeErrorAutoRecover6:3"}}); + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "ManifestWriteRetryableErrorAutoRecover:2"}}); SyncPoint::GetInstance()->SetCallBack( - "BuildTable:BeforeFinishBuildTable", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + "VersionSet::LogAndApply:WriteManifest", + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover6:0"); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover6:1"); - fault_fs->SetFilesystemActive(true); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:0"); + fault_fs_->SetFilesystemActive(true); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover6:2"); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover6:3"); - // The first resume will cause recovery_error and its severity is the - // Fatal error - s = dbfull()->Close(); - ASSERT_EQ(s, Status::OK()); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:1"); + TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:2"); SyncPoint::GetInstance()->DisableProcessing(); - Reopen(options); - ASSERT_EQ("val1", Get(Key(1))); - ASSERT_OK(Put(Key(2), "val2")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); - ASSERT_EQ("val2", Get(Key(2))); + new_manifest = GetManifestNameFromLiveFiles(); + ASSERT_NE(new_manifest, old_manifest); - Destroy(options); + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Close(); } -TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { +TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableErrorAutoRecover) { // Fail the first resume and let the second resume be successful - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = 
fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.max_bgerror_resume_count = 2; @@ -1820,27 +2037,29 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); error_msg.SetRetryable(true); - ASSERT_OK(Put(Key(0), "val")); + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + ASSERT_OK(Put(Key(0), "val", wo)); ASSERT_OK(Flush()); - ASSERT_OK(Put(Key(1), "val")); + ASSERT_OK(Put(Key(1), "val", wo)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"RecoverFromRetryableBGIOError:BeforeStart", - "ManifestWriteRetryableErrorAutoRecover:0"}, - {"ManifestWriteRetryableErrorAutoRecover:1", + "ManifestWriteNoWALRetryableErrorAutoRecover:0"}, + {"ManifestWriteNoWALRetryableErrorAutoRecover:1", "RecoverFromRetryableBGIOError:BeforeWait1"}, {"RecoverFromRetryableBGIOError:RecoverSuccess", - "ManifestWriteRetryableErrorAutoRecover:2"}}); + "ManifestWriteNoWALRetryableErrorAutoRecover:2"}}); SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", - [&](void*) { fault_fs->SetFilesystemActive(false, error_msg); }); + [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); - TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:0"); - fault_fs->SetFilesystemActive(true); + ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:0"); + fault_fs_->SetFilesystemActive(true); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:1"); - TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:2"); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:1"); + TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:2"); SyncPoint::GetInstance()->DisableProcessing(); new_manifest = GetManifestNameFromLiveFiles(); @@ -1854,13 +2073,10 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableErrorAutoRecover) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -1878,8 +2094,7 @@ TEST_F(DBErrorHandlingFSTest, ASSERT_OK(Put(Key(0), "val")); ASSERT_OK(Put(Key(2), "val")); - s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(Flush()); listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); listener->EnableAutoRecovery(false); @@ -1909,14 +2124,14 @@ TEST_F(DBErrorHandlingFSTest, ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { if (fail_manifest.load()) { - fault_fs->SetFilesystemActive(false, error_msg); + fault_fs_->SetFilesystemActive(false, error_msg); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); TEST_SYNC_POINT("CompactionManifestWriteErrorAR:0"); 
TEST_SYNC_POINT("CompactionManifestWriteErrorAR:1"); @@ -1925,7 +2140,7 @@ TEST_F(DBErrorHandlingFSTest, ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); TEST_SYNC_POINT("CompactionManifestWriteErrorAR:2"); TEST_SYNC_POINT("CompactionManifestWriteErrorAR:3"); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); TEST_SYNC_POINT("CompactionManifestWriteErrorAR:4"); TEST_SYNC_POINT("CompactionManifestWriteErrorAR:5"); @@ -1948,13 +2163,10 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { // compaction, the FS is set to active and compaction is successful, so // the test will hit the CompactionJob::FinishCompactionOutputFile1 sync // point. - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -1969,7 +2181,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { ASSERT_OK(Put(Key(0), "va;")); ASSERT_OK(Put(Key(2), "va;")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError)); listener->EnableAutoRecovery(false); @@ -1980,13 +2192,13 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { "CompactionWriteRetryableErrorAutoRecover0"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:Start", - [&](void*) { fault_fs->SetFilesystemActive(true); }); + [&](void*) { fault_fs_->SetFilesystemActive(true); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BackgroundCallCompaction:0", [&](void*) { fail_first.store(true); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::OpenCompactionOutputFile", [&](void*) { if (fail_first.load() && fail_second.load()) { - fault_fs->SetFilesystemActive(false, error_msg); + fault_fs_->SetFilesystemActive(false, error_msg); fail_second.store(false); } }); @@ -1994,11 +2206,10 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); - + ASSERT_OK(s); TEST_SYNC_POINT("CompactionWriteRetryableErrorAutoRecover0"); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); @@ -2006,13 +2217,10 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { } TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.writable_file_max_buffer_size = 32768; options.listeners.emplace_back(listener); @@ -2037,7 +2245,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), 
Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; // For the second batch, the first 2 file Append are successful, then the @@ -2050,7 +2258,8 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024))); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"RecoverFromRetryableBGIOError:BeforeResume0", "WALWriteError1:0"}, + {{"WALWriteErrorDone", "RecoverFromRetryableBGIOError:BeforeStart"}, + {"RecoverFromRetryableBGIOError:BeforeResume0", "WALWriteError1:0"}, {"WALWriteError1:1", "RecoverFromRetryableBGIOError:BeforeResume1"}, {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError1:2"}}); @@ -2058,7 +2267,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { write_error++; if (write_error > 2) { - fault_fs->SetFilesystemActive(false, error_msg); + fault_fs_->SetFilesystemActive(false, error_msg); } }); SyncPoint::GetInstance()->EnableProcessing(); @@ -2066,9 +2275,10 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { wopts.sync = true; s = dbfull()->Write(wopts, &batch); ASSERT_EQ(true, s.IsIOError()); + TEST_SYNC_POINT("WALWriteErrorDone"); TEST_SYNC_POINT("WALWriteError1:0"); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); TEST_SYNC_POINT("WALWriteError1:1"); TEST_SYNC_POINT("WALWriteError1:2"); @@ -2094,7 +2304,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; Reopen(options); @@ -2110,13 +2320,10 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { // Fail the first recover and try second time. 
- std::shared_ptr<FaultInjectionTestFS> fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr<ErrorHandlerFSListener> listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.writable_file_max_buffer_size = 32768; options.listeners.emplace_back(listener); @@ -2141,7 +2348,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; // For the second batch, the first 2 file Append are successful, then the @@ -2162,7 +2369,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { write_error++; if (write_error > 2) { - fault_fs->SetFilesystemActive(false, error_msg); + fault_fs_->SetFilesystemActive(false, error_msg); } }); SyncPoint::GetInstance()->EnableProcessing(); @@ -2172,7 +2379,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { ASSERT_EQ(true, s.IsIOError()); TEST_SYNC_POINT("WALWriteError2:0"); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); TEST_SYNC_POINT("WALWriteError2:1"); TEST_SYNC_POINT("WALWriteError2:2"); @@ -2198,7 +2405,7 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; Reopen(options); @@ -2216,13 +2423,10 @@ class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest, public testing::WithParamInterface<bool> {}; TEST_P(DBErrorHandlingFencingTest, FLushWriteFenced) { - std::shared_ptr<FaultInjectionTestFS> fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr<ErrorHandlerFSListener> listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.paranoid_checks = GetParam(); @@ -2233,27 +2437,24 @@ TEST_P(DBErrorHandlingFencingTest, FLushWriteFenced) { ASSERT_OK(Put(Key(0), "val")); SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); ASSERT_TRUE(s.IsIOFenced()); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); ASSERT_TRUE(s.IsIOFenced()); Destroy(options); } TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) { - std::shared_ptr<FaultInjectionTestFS> fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr<ErrorHandlerFSListener> listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.listeners.emplace_back(listener); options.paranoid_checks = GetParam(); @@ -2266,11 +2467,11 @@ TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) {
old_manifest = GetManifestNameFromLiveFiles(); ASSERT_OK(Put(Key(0), "val")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put(Key(1), "val")); SyncPoint::GetInstance()->SetCallBack( "VersionSet::LogAndApply:WriteManifest", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); @@ -2278,20 +2479,17 @@ TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) { ASSERT_TRUE(s.IsIOFenced()); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); ASSERT_TRUE(s.IsIOFenced()); Close(); } TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.level0_file_num_compaction_trigger = 2; options.listeners.emplace_back(listener); @@ -2302,7 +2500,7 @@ TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) { ASSERT_OK(Put(Key(0), "va;")); ASSERT_OK(Put(Key(2), "va;")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); listener->EnableAutoRecovery(true); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( @@ -2310,32 +2508,29 @@ TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) { "BackgroundCallCompaction:0"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BackgroundCallCompaction:0", [&](void*) { - fault_fs->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); + fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced")); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put(Key(1), "val")); s = Flush(); - ASSERT_EQ(s, Status::OK()); + ASSERT_OK(s); s = dbfull()->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError); ASSERT_TRUE(s.IsIOFenced()); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); s = dbfull()->Resume(); ASSERT_TRUE(s.IsIOFenced()); Destroy(options); } TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) { - std::shared_ptr fault_fs( - new FaultInjectionTestFS(FileSystem::Default())); - std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); std::shared_ptr listener( new ErrorHandlerFSListener()); Options options = GetDefaultOptions(); - options.env = fault_fs_env.get(); + options.env = fault_env_.get(); options.create_if_missing = true; options.writable_file_max_buffer_size = 32768; options.listeners.emplace_back(listener); @@ -2355,7 +2550,7 @@ TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) { WriteOptions wopts; wopts.sync = true; - ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + ASSERT_OK(dbfull()->Write(wopts, &batch)); }; { @@ -2370,8 +2565,8 @@ TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) { "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { write_error++; if (write_error > 2) { - fault_fs->SetFilesystemActive(false, - IOStatus::IOFenced("IO fenced")); + fault_fs_->SetFilesystemActive(false, + IOStatus::IOFenced("IO fenced")); } }); SyncPoint::GetInstance()->EnableProcessing(); @@ -2381,7 +2576,7 @@ TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) 
{ ASSERT_TRUE(s.IsIOFenced()); } SyncPoint::GetInstance()->DisableProcessing(); - fault_fs->SetFilesystemActive(true); + fault_fs_->SetFilesystemActive(true); { WriteBatch batch; diff --git a/db/event_helpers.cc b/db/event_helpers.cc index b9fa35e330d..6164dde2962 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -108,6 +108,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( table_properties.num_entries) << "num_data_blocks" << table_properties.num_data_blocks << "num_entries" << table_properties.num_entries + << "num_filter_entries" << table_properties.num_filter_entries << "num_deletions" << table_properties.num_deletions << "num_merge_operands" << table_properties.num_merge_operands << "num_range_deletions" << table_properties.num_range_deletions @@ -125,8 +126,12 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( << table_properties.compression_options << "creation_time" << table_properties.creation_time << "oldest_key_time" << table_properties.oldest_key_time << "file_creation_time" - << table_properties.file_creation_time << "db_id" - << table_properties.db_id << "db_session_id" + << table_properties.file_creation_time + << "slow_compression_estimated_data_size" + << table_properties.slow_compression_estimated_data_size + << "fast_compression_estimated_data_size" + << table_properties.fast_compression_estimated_data_size + << "db_id" << table_properties.db_id << "db_session_id" << table_properties.db_session_id; // user collected properties @@ -213,17 +218,16 @@ void EventHelpers::NotifyOnErrorRecoveryCompleted( const std::vector>& listeners, Status old_bg_error, InstrumentedMutex* db_mutex) { #ifndef ROCKSDB_LITE - if (listeners.size() == 0U) { - return; - } - db_mutex->AssertHeld(); - // release lock while notifying events - db_mutex->Unlock(); - for (auto& listener : listeners) { - listener->OnErrorRecoveryCompleted(old_bg_error); + if (listeners.size() > 0) { + db_mutex->AssertHeld(); + // release lock while notifying events + db_mutex->Unlock(); + for (auto& listener : listeners) { + listener->OnErrorRecoveryCompleted(old_bg_error); + } + db_mutex->Lock(); } old_bg_error.PermitUncheckedError(); - db_mutex->Lock(); #else (void)listeners; (void)old_bg_error; diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 624952563fe..035cb3698c3 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -24,13 +24,13 @@ class ExternalSSTFileBasicTest ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_basic_test", /*env_do_fsync=*/true) { sst_files_dir_ = dbname_ + "/sst_files/"; - fault_injection_test_env_.reset(new FaultInjectionTestEnv(Env::Default())); + fault_injection_test_env_.reset(new FaultInjectionTestEnv(env_)); DestroyAndRecreateExternalSSTFilesDir(); } void DestroyAndRecreateExternalSSTFilesDir() { - DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); + ASSERT_OK(DestroyDir(env_, sst_files_dir_)); + ASSERT_OK(env_->CreateDir(sst_files_dir_)); } Status DeprecatedAddFile(const std::vector& files, @@ -162,7 +162,9 @@ class ExternalSSTFileBasicTest write_global_seqno, verify_checksums_before_ingest, true_data); } - ~ExternalSSTFileBasicTest() override { DestroyDir(env_, sst_files_dir_); } + ~ExternalSSTFileBasicTest() override { + DestroyDir(env_, sst_files_dir_).PermitUncheckedError(); + } protected: std::string sst_files_dir_; @@ -186,7 +188,7 @@ TEST_F(ExternalSSTFileBasicTest, Basic) { } ExternalSstFileInfo file1_info; Status s = 
sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); // Current file size should be non-zero after success write. ASSERT_GT(sst_file_writer.FileSize(), 0); @@ -202,14 +204,14 @@ TEST_F(ExternalSSTFileBasicTest, Basic) { ASSERT_EQ(file1_info.file_checksum_func_name, kUnknownFileChecksumFuncName); // sst_file_writer already finished, cannot add this value s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); s = sst_file_writer.DeleteRange(Key(100), Key(200)); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); DestroyAndReopen(options); // Add file using file path s = DeprecatedAddFile({file1}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 100; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -286,7 +288,7 @@ TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) { } ExternalSstFileInfo file1_info; Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); std::string file_checksum, file_checksum_func_name; ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName( file1, &file_checksum, &file_checksum_func_name)); @@ -305,14 +307,14 @@ TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) { ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name); // sst_file_writer already finished, cannot add this value s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); s = sst_file_writer.DeleteRange(Key(100), Key(200)); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); DestroyAndReopen(options); // Add file using file path s = DeprecatedAddFile({file1}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 100; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -338,7 +340,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ExternalSstFileInfo file1_info; Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(1000)); @@ -357,7 +359,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ExternalSstFileInfo file2_info; s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 200); ASSERT_EQ(file2_info.smallest_key, Key(1100)); @@ -376,7 +378,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ExternalSstFileInfo file3_info; s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 200); ASSERT_EQ(file3_info.smallest_key, Key(1300)); @@ -395,7 +397,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ExternalSstFileInfo file4_info; s = sst_file_writer.Finish(&file4_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file4_info.file_path, file4); ASSERT_EQ(file4_info.num_entries, 300); ASSERT_EQ(file4_info.smallest_key, Key(1500)); @@ -414,7 +416,7 @@ 
TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ExternalSstFileInfo file5_info; s = sst_file_writer.Finish(&file5_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file5_info.file_path, file5); ASSERT_EQ(file5_info.num_entries, 200); ASSERT_EQ(file5_info.smallest_key, Key(1800)); @@ -433,7 +435,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { } ExternalSstFileInfo file6_info; s = sst_file_writer.Finish(&file6_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file6_info.file_path, file6); ASSERT_EQ(file6_info.num_entries, 200); ASSERT_EQ(file6_info.smallest_key, Key(2000)); @@ -447,7 +449,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { s = AddFileWithFileChecksum({file1}, {file_checksum1, "xyz"}, {file_checksum1}, true, false, false, false); // does not care the checksum input since db does not enable file checksum - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file1)); std::vector live_files; dbfull()->GetLiveFilesMetaData(&live_files); @@ -465,26 +467,26 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { s = AddFileWithFileChecksum({file2}, {file_checksum2, "xyz"}, {file_checksum_func_name2}, true, false, false, false); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); // Enable verify_file_checksum option // The checksum name does not match, fail the ingestion s = AddFileWithFileChecksum({file2}, {file_checksum2}, {"xyz"}, true, false, false, false); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); // Enable verify_file_checksum option // The checksum itself does not match, fail the ingestion s = AddFileWithFileChecksum({file2}, {"xyz"}, {file_checksum_func_name2}, true, false, false, false); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); // Enable verify_file_checksum option // All matches, ingestion is successful s = AddFileWithFileChecksum({file2}, {file_checksum2}, {file_checksum_func_name2}, true, false, false, false); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); std::vector live_files1; dbfull()->GetLiveFilesMetaData(&live_files1); for (auto f : live_files1) { @@ -501,7 +503,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { std::vector checksum, checksum_func; s = AddFileWithFileChecksum({file3}, checksum, checksum_func, true, false, false, false); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); std::vector live_files2; dbfull()->GetLiveFilesMetaData(&live_files2); for (auto f : live_files2) { @@ -511,20 +513,20 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { set1.insert(f.name); } } - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file3)); // Does not enable verify_file_checksum options // The checksum name does not match, fail the ingestion s = AddFileWithFileChecksum({file4}, {file_checksum4}, {"xyz"}, false, false, false, false); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); // Does not enable verify_file_checksum options // Checksum function name matches, store the checksum being ingested. 
s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4}, false, false, false, false); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); std::vector live_files3; dbfull()->GetLiveFilesMetaData(&live_files3); for (auto f : live_files3) { @@ -535,7 +537,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { set1.insert(f.name); } } - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file4)); // enable verify_file_checksum options, DB enable checksum, and enable @@ -544,8 +546,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { s = AddFileWithFileChecksum({file5}, {file_checksum5}, {file_checksum_func_name5}, true, false, false, true); - ASSERT_OK(s); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); std::vector live_files4; dbfull()->GetLiveFilesMetaData(&live_files4); for (auto f : live_files4) { @@ -558,7 +559,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { set1.insert(f.name); } } - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file5)); // Does not enable verify_file_checksum options and also the ingested file @@ -567,7 +568,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { std::vector files_c6, files_name6; s = AddFileWithFileChecksum({file6}, files_c6, files_name6, false, false, false, false); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); std::vector live_files6; dbfull()->GetLiveFilesMetaData(&live_files6); for (auto f : live_files6) { @@ -577,7 +578,7 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) { set1.insert(f.name); } } - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file6)); } @@ -595,7 +596,7 @@ TEST_F(ExternalSSTFileBasicTest, NoCopy) { } ExternalSstFileInfo file1_info; Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); @@ -609,7 +610,7 @@ TEST_F(ExternalSSTFileBasicTest, NoCopy) { } ExternalSstFileInfo file2_info; s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 200); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -623,23 +624,23 @@ TEST_F(ExternalSSTFileBasicTest, NoCopy) { } ExternalSstFileInfo file3_info; s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 15); ASSERT_EQ(file3_info.smallest_key, Key(110)); ASSERT_EQ(file3_info.largest_key, Key(124)); s = DeprecatedAddFile({file1}, true /* move file */); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(Status::NotFound(), env_->FileExists(file1)); s = DeprecatedAddFile({file2}, false /* copy file */); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file2)); // This file has overlapping values with the existing data s = DeprecatedAddFile({file3}, true /* move file */); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(s) << s.ToString(); ASSERT_OK(env_->FileExists(file3)); for (int k = 0; k < 300; k++) { @@ -1109,6 +1110,7 @@ TEST_F(ExternalSSTFileBasicTest, SyncFailure) { } 
Options sst_file_writer_options; + sst_file_writer_options.env = env_; std::unique_ptr<SstFileWriter> sst_file_writer( new SstFileWriter(EnvOptions(), sst_file_writer_options)); std::string file_name = @@ -1125,7 +1127,7 @@ TEST_F(ExternalSSTFileBasicTest, SyncFailure) { if (i == 2) { ingest_opt.write_global_seqno = true; } - ASSERT_FALSE(db_->IngestExternalFile({file_name}, ingest_opt).ok()); + ASSERT_NOK(db_->IngestExternalFile({file_name}, ingest_opt)); db_->ReleaseSnapshot(snapshot); SyncPoint::GetInstance()->DisableProcessing(); @@ -1134,14 +1136,50 @@ } } +TEST_F(ExternalSSTFileBasicTest, ReopenNotSupported) { + Options options; + options.create_if_missing = true; + options.env = env_; + + SyncPoint::GetInstance()->SetCallBack( + "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* arg) { + Status* s = static_cast<Status*>(arg); + *s = Status::NotSupported(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + DestroyAndReopen(options); + + Options sst_file_writer_options; + sst_file_writer_options.env = env_; + std::unique_ptr<SstFileWriter> sst_file_writer( + new SstFileWriter(EnvOptions(), sst_file_writer_options)); + std::string file_name = + sst_files_dir_ + "reopen_not_supported_test_" + ".sst"; + ASSERT_OK(sst_file_writer->Open(file_name)); + ASSERT_OK(sst_file_writer->Put("bar", "v2")); + ASSERT_OK(sst_file_writer->Finish()); + + IngestExternalFileOptions ingest_opt; + ingest_opt.move_files = true; + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt)); + db_->ReleaseSnapshot(snapshot); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(options); +} + TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) { Options options; options.create_if_missing = true; - SpecialEnv senv(Env::Default()); + SpecialEnv senv(env_); options.env = &senv; DestroyAndReopen(options); Options sst_file_writer_options; + sst_file_writer_options.env = env_; std::unique_ptr<SstFileWriter> sst_file_writer( new SstFileWriter(EnvOptions(), sst_file_writer_options)); std::string file_name = sst_files_dir_ + "verify_checksum_readahead_test.sst"; @@ -1324,7 +1362,7 @@ TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) { ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400))); ExternalSstFileInfo file8_info; Status s = sst_file_writer.Finish(&file8_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file8_info.file_path, file8); ASSERT_EQ(file8_info.num_entries, 0); ASSERT_EQ(file8_info.smallest_key, ""); @@ -1339,7 +1377,7 @@ TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) { ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500))); ExternalSstFileInfo file9_info; s = sst_file_writer.Finish(&file9_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(file9_info.file_path, file9); ASSERT_EQ(file9_info.num_entries, 0); ASSERT_EQ(file9_info.smallest_key, ""); @@ -1351,7 +1389,7 @@ TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) { // Range deletion tombstones are exclusive on their end key, so these SSTs // should not be considered as overlapping.
s = DeprecatedAddFile({file8, file9}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(s) << s.ToString(); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); DestroyAndRecreateExternalSSTFilesDir(); } @@ -1539,6 +1577,44 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) { ASSERT_EQ(2, NumTableFilesAtLevel(0)); } +TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) { + // Repro https://github.com/facebook/rocksdb/issues/6245. + // Flush three files to L0. Ingest one more file to trigger L0->L1 compaction + // via trivial move. The bug happened when L1 files were incorrectly sorted + // resulting in an old value for "k" returned by `Get()`. + Options options = CurrentOptions(); + + ASSERT_OK(Put("k", "a")); + Flush(); + ASSERT_OK(Put("k", "a")); + Flush(); + ASSERT_OK(Put("k", "a")); + Flush(); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // Current file size should be 0 after sst_file_writer init and before open a + // file. + ASSERT_EQ(sst_file_writer.FileSize(), 0); + + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("k", "b")); + + ExternalSstFileInfo file1_info; + Status s = sst_file_writer.Finish(&file1_info); + ASSERT_OK(s) << s.ToString(); + + // Current file size should be non-zero after success write. + ASSERT_GT(sst_file_writer.FileSize(), 0); + + IngestExternalFileOptions ifo; + s = db_->IngestExternalFile({file1}, ifo); + ASSERT_OK(s); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_EQ(Get("k"), "b"); +} + INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest, testing::Values(std::make_tuple(true, true), std::make_tuple(true, false), diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 0fcaf776256..11c0155f672 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -40,16 +40,25 @@ Status ExternalSstFileIngestionJob::Prepare( if (!status.ok()) { return status; } - files_to_ingest_.push_back(file_to_ingest); - } - for (const IngestedFileInfo& f : files_to_ingest_) { - if (f.cf_id != + if (file_to_ingest.cf_id != TablePropertiesCollectorFactory::Context::kUnknownColumnFamily && - f.cf_id != cfd_->GetID()) { + file_to_ingest.cf_id != cfd_->GetID()) { return Status::InvalidArgument( "External file column family id don't match"); } + + if (file_to_ingest.num_entries == 0 && + file_to_ingest.num_range_deletions == 0) { + return Status::InvalidArgument("File contain no entries"); + } + + if (!file_to_ingest.smallest_internal_key.Valid() || + !file_to_ingest.largest_internal_key.Valid()) { + return Status::Corruption("Generated table have corrupted keys"); + } + + files_to_ingest_.emplace_back(std::move(file_to_ingest)); } const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); @@ -83,16 +92,6 @@ Status ExternalSstFileIngestionJob::Prepare( return Status::NotSupported("Files have overlapping ranges"); } - for (IngestedFileInfo& f : files_to_ingest_) { - if (f.num_entries == 0 && f.num_range_deletions == 0) { - return Status::InvalidArgument("File contain no entries"); - } - - if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) { - return Status::Corruption("Generated table have corrupted keys"); - } - } - // Copy/Move external files into DB std::unordered_set ingestion_path_ids; for (IngestedFileInfo& f : files_to_ingest_) { @@ -111,18 +110,26 @@ Status ExternalSstFileIngestionJob::Prepare( // directory before ingest the file. 
For integrity of RocksDB we need // to sync the file. std::unique_ptr file_to_sync; - status = fs_->ReopenWritableFile(path_inside_db, env_options_, - &file_to_sync, nullptr); - if (status.ok()) { - TEST_SYNC_POINT( - "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); - status = SyncIngestedFile(file_to_sync.get()); - TEST_SYNC_POINT( - "ExternalSstFileIngestionJob::AfterSyncIngestedFile"); - if (!status.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "Failed to sync ingested file %s: %s", - path_inside_db.c_str(), status.ToString().c_str()); + Status s = fs_->ReopenWritableFile(path_inside_db, env_options_, + &file_to_sync, nullptr); + TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen", + &s); + // Some file systems (especially remote/distributed) don't support + // reopening a file for writing and don't require reopening and + // syncing the file. Ignore the NotSupported error in that case. + if (!s.IsNotSupported()) { + status = s; + if (status.ok()) { + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::BeforeSyncIngestedFile"); + status = SyncIngestedFile(file_to_sync.get()); + TEST_SYNC_POINT( + "ExternalSstFileIngestionJob::AfterSyncIngestedFile"); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to sync ingested file %s: %s", + path_inside_db.c_str(), status.ToString().c_str()); + } } } } @@ -204,7 +211,8 @@ Status ExternalSstFileIngestionJob::Prepare( requested_checksum_func_name, &generated_checksum, &generated_checksum_func_name, ingestion_options_.verify_checksums_readahead_size, - db_options_.allow_mmap_reads, io_tracer_); + db_options_.allow_mmap_reads, io_tracer_, + db_options_.rate_limiter.get()); if (!io_s.ok()) { status = io_s; ROCKS_LOG_WARN(db_options_.info_log, @@ -295,12 +303,13 @@ Status ExternalSstFileIngestionJob::Prepare( // TODO: The following is duplicated with Cleanup(). if (!status.ok()) { + IOOptions io_opts; // We failed, remove all files that we copied into the db for (IngestedFileInfo& f : files_to_ingest_) { if (f.internal_file_path.empty()) { continue; } - Status s = env_->DeleteFile(f.internal_file_path); + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "AddFile() clean up for file %s failed : %s", @@ -319,8 +328,8 @@ Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(), file_to_ingest.largest_internal_key.user_key()); } - Status status = - cfd_->RangesOverlapWithMemtables(ranges, super_version, flush_needed); + Status status = cfd_->RangesOverlapWithMemtables( + ranges, super_version, db_options_.allow_data_in_errors, flush_needed); if (status.ok() && *flush_needed && !ingestion_options_.allow_blocking_flush) { status = Status::InvalidArgument("External file requires flush"); @@ -338,6 +347,12 @@ Status ExternalSstFileIngestionJob::Run() { // with the files we are ingesting bool need_flush = false; status = NeedsFlush(&need_flush, super_version); + if (!status.ok()) { + return status; + } + if (need_flush) { + return Status::TryAgain(); + } assert(status.ok() && need_flush == false); #endif @@ -363,9 +378,32 @@ Status ExternalSstFileIngestionJob::Run() { super_version, force_global_seqno, cfd_->ioptions()->compaction_style, last_seqno, &f, &assigned_seqno); } + + // Modify the smallest/largest internal key to include the sequence number + // that we just learned. Only overwrite sequence number zero. 
There could + // be a nonzero sequence number already to indicate a range tombstone's + // exclusive endpoint. + ParsedInternalKey smallest_parsed, largest_parsed; + if (status.ok()) { + status = ParseInternalKey(*f.smallest_internal_key.rep(), + &smallest_parsed, false /* log_err_key */); + } + if (status.ok()) { + status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed, + false /* log_err_key */); + } if (!status.ok()) { return status; } + if (smallest_parsed.sequence == 0) { + UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno, + smallest_parsed.type); + } + if (largest_parsed.sequence == 0) { + UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno, + largest_parsed.type); + } + status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno); TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", &assigned_seqno); @@ -388,7 +426,7 @@ Status ExternalSstFileIngestionJob::Run() { int64_t temp_current_time = 0; uint64_t current_time = kUnknownFileCreationTime; uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; - if (env_->GetCurrentTime(&temp_current_time).ok()) { + if (clock_->GetCurrentTime(&temp_current_time).ok()) { current_time = oldest_ancester_time = static_cast(temp_current_time); } @@ -406,7 +444,7 @@ void ExternalSstFileIngestionJob::UpdateStats() { // Update internal stats for new ingested files uint64_t total_keys = 0; uint64_t total_l0_files = 0; - uint64_t total_time = env_->NowMicros() - job_start_time_; + uint64_t total_time = clock_->NowMicros() - job_start_time_; EventLoggerStream stream = event_logger_->Log(); stream << "event" @@ -462,6 +500,7 @@ void ExternalSstFileIngestionJob::UpdateStats() { } void ExternalSstFileIngestionJob::Cleanup(const Status& status) { + IOOptions io_opts; if (!status.ok()) { // We failed to add the files to the database // remove all the files we copied @@ -469,7 +508,7 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) { if (f.internal_file_path.empty()) { continue; } - Status s = env_->DeleteFile(f.internal_file_path); + Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "AddFile() clean up for file %s failed : %s", @@ -481,7 +520,7 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) { } else if (status.ok() && ingestion_options_.move_files) { // The files were moved and added successfully, remove original file links for (IngestedFileInfo& f : files_to_ingest_) { - Status s = env_->DeleteFile(f.external_file_path); + Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr); if (!s.ok()) { ROCKS_LOG_WARN( db_options_.info_log, @@ -605,22 +644,28 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( file_to_ingest->largest_internal_key = InternalKey("", 0, ValueType::kTypeValue); bool bounds_set = false; + bool allow_data_in_errors = db_options_.allow_data_in_errors; iter->SeekToFirst(); if (iter->Valid()) { - if (ParseInternalKey(iter->key(), &key) != Status::OK()) { - return Status::Corruption("external file have corrupted keys"); + Status pik_status = + ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. 
", + pik_status.getState()); } if (key.sequence != 0) { - return Status::Corruption("external file have non zero sequence number"); + return Status::Corruption("External file has non zero sequence number"); } file_to_ingest->smallest_internal_key.SetFrom(key); iter->SeekToLast(); - if (ParseInternalKey(iter->key(), &key) != Status::OK()) { - return Status::Corruption("external file have corrupted keys"); + pik_status = ParseInternalKey(iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); } if (key.sequence != 0) { - return Status::Corruption("external file have non zero sequence number"); + return Status::Corruption("External file has non zero sequence number"); } file_to_ingest->largest_internal_key.SetFrom(key); @@ -633,8 +678,11 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( if (range_del_iter != nullptr) { for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); range_del_iter->Next()) { - if (ParseInternalKey(range_del_iter->key(), &key) != Status::OK()) { - return Status::Corruption("external file have corrupted keys"); + Status pik_status = + ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); } RangeTombstone tombstone(key, range_del_iter->value()); @@ -797,7 +845,8 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile( fs_->NewRandomRWFile(file_to_ingest->internal_file_path, env_options_, &rwfile, nullptr); if (status.ok()) { - FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_); + FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_, + file_to_ingest->internal_file_path); std::string seqno_val; PutFixed64(&seqno_val, seqno); status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val, @@ -844,7 +893,7 @@ IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile( db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name, &file_checksum, &file_checksum_func_name, ingestion_options_.verify_checksums_readahead_size, - db_options_.allow_mmap_reads, io_tracer_); + db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get()); if (!io_s.ok()) { return io_s; } diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index a2782f54a5e..c669089d92b 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -16,13 +16,14 @@ #include "logging/event_logger.h" #include "options/db_options.h" #include "rocksdb/db.h" -#include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "rocksdb/sst_file_writer.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { class Directories; +class SystemClock; struct IngestedFileInfo { // External file path @@ -73,13 +74,13 @@ struct IngestedFileInfo { class ExternalSstFileIngestionJob { public: ExternalSstFileIngestionJob( - Env* env, VersionSet* versions, ColumnFamilyData* cfd, + VersionSet* versions, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const EnvOptions& env_options, SnapshotList* db_snapshots, const IngestExternalFileOptions& ingestion_options, Directories* directories, EventLogger* event_logger, const std::shared_ptr& io_tracer) - : env_(env), + : clock_(db_options.clock), fs_(db_options.fs, io_tracer), versions_(versions), cfd_(cfd), @@ -89,7 +90,7 @@ class ExternalSstFileIngestionJob { ingestion_options_(ingestion_options), 
directories_(directories), event_logger_(event_logger), - job_start_time_(env_->NowMicros()), + job_start_time_(clock_->NowMicros()), consumed_seqno_count_(0), io_tracer_(io_tracer) { assert(directories != nullptr); @@ -169,7 +170,7 @@ class ExternalSstFileIngestionJob { template Status SyncIngestedFile(TWritableFile* file); - Env* env_; + SystemClock* clock_; FileSystemPtr fs_; VersionSet* versions_; ColumnFamilyData* cfd_; diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index 0ccaf51eddf..6f3f0b3f262 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -16,6 +16,7 @@ #include "rocksdb/sst_file_writer.h" #include "test_util/testutil.h" #include "util/random.h" +#include "util/thread_guard.h" #include "utilities/fault_injection_env.h" namespace ROCKSDB_NAMESPACE { @@ -47,8 +48,8 @@ class ExternSSTFileLinkFailFallbackTest : DBTestBase("/external_sst_file_test", /*env_do_fsync=*/true), test_env_(new ExternalSSTTestEnv(env_, true)) { sst_files_dir_ = dbname_ + "/sst_files/"; - DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); + EXPECT_EQ(DestroyDir(env_, sst_files_dir_), Status::OK()); + EXPECT_EQ(env_->CreateDir(sst_files_dir_), Status::OK()); options_ = CurrentOptions(); options_.disable_auto_compactions = true; options_.env = test_env_; @@ -79,8 +80,8 @@ class ExternalSSTFileTest } void DestroyAndRecreateExternalSSTFilesDir() { - DestroyDir(env_, sst_files_dir_); - env_->CreateDir(sst_files_dir_); + ASSERT_OK(DestroyDir(env_, sst_files_dir_)); + ASSERT_OK(env_->CreateDir(sst_files_dir_)); } Status GenerateOneExternalFile( @@ -116,7 +117,7 @@ class ExternalSSTFileTest for (const auto& entry : data) { s = sst_file_writer.Put(entry.first, entry.second); if (!s.ok()) { - sst_file_writer.Finish(); + sst_file_writer.Finish().PermitUncheckedError(); return s; } } @@ -171,7 +172,7 @@ class ExternalSSTFileTest for (auto& entry : data) { s = sst_file_writer.Put(entry.first, entry.second); if (!s.ok()) { - sst_file_writer.Finish(); + sst_file_writer.Finish().PermitUncheckedError(); return s; } } @@ -213,11 +214,10 @@ class ExternalSSTFileTest size_t num_cfs = column_families.size(); assert(ifos.size() == num_cfs); assert(data.size() == num_cfs); - Status s; std::vector args(num_cfs); for (size_t i = 0; i != num_cfs; ++i) { std::string external_file_path; - s = GenerateOneExternalFile( + Status s = GenerateOneExternalFile( options, column_families[i], data[i], file_id, sort_data, &external_file_path, true_data.size() == num_cfs ? &true_data[i] : nullptr); @@ -230,8 +230,7 @@ class ExternalSSTFileTest args[i].external_files.push_back(external_file_path); args[i].options = ifos[i]; } - s = db_->IngestExternalFiles(args); - return s; + return db_->IngestExternalFiles(args); } Status GenerateAndAddExternalFile( @@ -282,7 +281,9 @@ class ExternalSSTFileTest return db_->IngestExternalFile(files, opts); } - ~ExternalSSTFileTest() override { DestroyDir(env_, sst_files_dir_); } + ~ExternalSSTFileTest() override { + DestroyDir(env_, sst_files_dir_).PermitUncheckedError(); + } protected: int last_file_id_ = 0; @@ -305,8 +306,7 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file1_info)); // Current file size should be non-zero after success write. 
ASSERT_GT(sst_file_writer.FileSize(), 0); @@ -319,8 +319,7 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_EQ(file1_info.smallest_range_del_key, ""); ASSERT_EQ(file1_info.largest_range_del_key, ""); // sst_file_writer already finished, cannot add this value - s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val")); // file2.sst (100 => 199) std::string file2 = sst_files_dir_ + "file2.sst"; @@ -329,11 +328,9 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } // Cannot add this key because it's not after last added key - s = sst_file_writer.Put(Key(99), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val")); ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file2_info)); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 100); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -347,9 +344,8 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); + ASSERT_OK(sst_file_writer.Finish(&file3_info)); - ASSERT_TRUE(s.ok()) << s.ToString(); // Current file size should be non-zero after success finish. ASSERT_GT(sst_file_writer.FileSize(), 0); ASSERT_EQ(file3_info.file_path, file3); @@ -365,8 +361,7 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file4_info; - s = sst_file_writer.Finish(&file4_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file4_info)); ASSERT_EQ(file4_info.file_path, file4); ASSERT_EQ(file4_info.num_entries, 10); ASSERT_EQ(file4_info.smallest_key, Key(30)); @@ -379,8 +374,7 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file5_info; - s = sst_file_writer.Finish(&file5_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file5_info)); ASSERT_EQ(file5_info.file_path, file5); ASSERT_EQ(file5_info.num_entries, 100); ASSERT_EQ(file5_info.smallest_key, Key(400)); @@ -389,10 +383,9 @@ TEST_F(ExternalSSTFileTest, Basic) { // file6.sst (delete 400 => 500) std::string file6 = sst_files_dir_ + "file6.sst"; ASSERT_OK(sst_file_writer.Open(file6)); - sst_file_writer.DeleteRange(Key(400), Key(500)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500))); ExternalSstFileInfo file6_info; - s = sst_file_writer.Finish(&file6_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file6_info)); ASSERT_EQ(file6_info.file_path, file6); ASSERT_EQ(file6_info.num_entries, 0); ASSERT_EQ(file6_info.smallest_key, ""); @@ -404,17 +397,16 @@ TEST_F(ExternalSSTFileTest, Basic) { // file7.sst (delete 500 => 570, put 520 => 599 divisible by 2) std::string file7 = sst_files_dir_ + "file7.sst"; ASSERT_OK(sst_file_writer.Open(file7)); - sst_file_writer.DeleteRange(Key(500), Key(550)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(500), Key(550))); for (int k = 520; k < 560; k += 2) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } - sst_file_writer.DeleteRange(Key(525), Key(575)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(525), Key(575))); for (int k = 560; k < 600; k += 2) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } 
ExternalSstFileInfo file7_info; - s = sst_file_writer.Finish(&file7_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file7_info)); ASSERT_EQ(file7_info.file_path, file7); ASSERT_EQ(file7_info.num_entries, 40); ASSERT_EQ(file7_info.smallest_key, Key(520)); @@ -426,10 +418,9 @@ TEST_F(ExternalSSTFileTest, Basic) { // file8.sst (delete 600 => 700) std::string file8 = sst_files_dir_ + "file8.sst"; ASSERT_OK(sst_file_writer.Open(file8)); - sst_file_writer.DeleteRange(Key(600), Key(700)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(600), Key(700))); ExternalSstFileInfo file8_info; - s = sst_file_writer.Finish(&file8_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file8_info)); ASSERT_EQ(file8_info.file_path, file8); ASSERT_EQ(file8_info.num_entries, 0); ASSERT_EQ(file8_info.smallest_key, ""); @@ -441,13 +432,11 @@ TEST_F(ExternalSSTFileTest, Basic) { // Cannot create an empty sst file std::string file_empty = sst_files_dir_ + "file_empty.sst"; ExternalSstFileInfo file_empty_info; - s = sst_file_writer.Finish(&file_empty_info); - ASSERT_NOK(s); + ASSERT_NOK(sst_file_writer.Finish(&file_empty_info)); DestroyAndReopen(options); // Add file using file path - s = DeprecatedAddFile({file1}); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(DeprecatedAddFile({file1})); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 100; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -468,12 +457,10 @@ TEST_F(ExternalSSTFileTest, Basic) { } // This file has overlapping values with the existing data - s = DeprecatedAddFile({file3}); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile({file3})); // This file has overlapping values with the existing data - s = DeprecatedAddFile({file4}); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile({file4})); // Overwrite values of keys divisible by 5 for (int k = 0; k < 200; k += 5) { @@ -485,8 +472,7 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_OK(DeprecatedAddFile({file5})); // This file has overlapping values with the existing data - s = DeprecatedAddFile({file6}); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile({file6})); // Key range of file7 (500 => 598) don't overlap with any keys in DB ASSERT_OK(DeprecatedAddFile({file7})); @@ -614,15 +600,13 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file1_info)); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); ASSERT_EQ(file1_info.largest_key, Key(99)); // sst_file_writer already finished, cannot add this value - s = sst_file_writer.Put(Key(100), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val")); // file2.sst (100 => 199) std::string file2 = sst_files_dir_ + "file2.sst"; @@ -631,11 +615,9 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } // Cannot add this key because it's not after last added key - s = sst_file_writer.Put(Key(99), "bad_val"); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val")); ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + 
ASSERT_OK(sst_file_writer.Finish(&file2_info)); ASSERT_EQ(file2_info.file_path, file2); ASSERT_EQ(file2_info.num_entries, 100); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -649,8 +631,7 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file3_info)); ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 5); ASSERT_EQ(file3_info.smallest_key, Key(195)); @@ -664,8 +645,7 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap")); } ExternalSstFileInfo file4_info; - s = sst_file_writer.Finish(&file4_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file4_info)); ASSERT_EQ(file4_info.file_path, file4); ASSERT_EQ(file4_info.num_entries, 10); ASSERT_EQ(file4_info.smallest_key, Key(30)); @@ -678,8 +658,7 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file5_info; - s = sst_file_writer.Finish(&file5_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file5_info)); ASSERT_EQ(file5_info.file_path, file5); ASSERT_EQ(file5_info.num_entries, 100); ASSERT_EQ(file5_info.smallest_key, Key(200)); @@ -691,8 +670,7 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(75))); ASSERT_OK(sst_file_writer.DeleteRange(Key(25), Key(100))); ExternalSstFileInfo file6_info; - s = sst_file_writer.Finish(&file6_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file6_info)); ASSERT_EQ(file6_info.file_path, file6); ASSERT_EQ(file6_info.num_entries, 0); ASSERT_EQ(file6_info.smallest_key, ""); @@ -706,8 +684,7 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_OK(sst_file_writer.Open(file7)); ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201))); ExternalSstFileInfo file7_info; - s = sst_file_writer.Finish(&file7_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file7_info)); ASSERT_EQ(file7_info.file_path, file7); ASSERT_EQ(file7_info.num_entries, 0); ASSERT_EQ(file7_info.smallest_key, ""); @@ -727,17 +704,13 @@ TEST_F(ExternalSSTFileTest, AddList) { DestroyAndReopen(options); // These lists of files have key ranges that overlap with each other - s = DeprecatedAddFile(file_list1); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(file_list1)); // Both of the following overlap on the range deletion tombstone. 
- s = DeprecatedAddFile(file_list4); - ASSERT_FALSE(s.ok()) << s.ToString(); - s = DeprecatedAddFile(file_list5); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(file_list4)); + ASSERT_NOK(DeprecatedAddFile(file_list5)); // Add files using file path list - s = DeprecatedAddFile(file_list0); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(DeprecatedAddFile(file_list0)); ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); for (int k = 0; k < 200; k++) { ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); @@ -778,8 +751,7 @@ TEST_F(ExternalSSTFileTest, AddList) { } // This file list has overlapping values with the existing data - s = DeprecatedAddFile(file_list3); - ASSERT_FALSE(s.ok()) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(file_list3)); // Overwrite values of keys divisible by 5 for (int k = 0; k < 200; k += 5) { @@ -847,16 +819,14 @@ TEST_F(ExternalSSTFileTest, AddListAtomicity) { for (int k = i * 100; k < (i + 1) * 100; k++) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } - Status s = sst_file_writer.Finish(&files_info[i]); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&files_info[i])); ASSERT_EQ(files_info[i].file_path, files[i]); ASSERT_EQ(files_info[i].num_entries, 100); ASSERT_EQ(files_info[i].smallest_key, Key(i * 100)); ASSERT_EQ(files_info[i].largest_key, Key((i + 1) * 100 - 1)); } files.push_back(sst_files_dir_ + "file" + std::to_string(n) + ".sst"); - auto s = DeprecatedAddFile(files); - ASSERT_NOK(s) << s.ToString(); + ASSERT_NOK(DeprecatedAddFile(files)); for (int k = 0; k < n * 100; k++) { ASSERT_EQ("NOT_FOUND", Get(Key(k))); } @@ -878,17 +848,14 @@ TEST_F(ExternalSSTFileTest, PurgeObsoleteFilesBug) { // file1.sst (0 => 500) std::string sst_file_path = sst_files_dir_ + "file1.sst"; - Status s = sst_file_writer.Open(sst_file_path); - ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Open(sst_file_path)); for (int i = 0; i < 500; i++) { std::string k = Key(i); - s = sst_file_writer.Put(k, k + "_val"); - ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Put(k, k + "_val")); } ExternalSstFileInfo sst_file_info; - s = sst_file_writer.Finish(&sst_file_info); - ASSERT_OK(s); + ASSERT_OK(sst_file_writer.Finish(&sst_file_info)); options.delete_obsolete_files_period_micros = 0; options.disable_auto_compactions = true; @@ -900,12 +867,11 @@ TEST_F(ExternalSSTFileTest, PurgeObsoleteFilesBug) { ASSERT_OK(Flush()); ASSERT_OK(Put("aaa", "xxx")); ASSERT_OK(Flush()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - s = DeprecatedAddFile({sst_file_path}); - ASSERT_OK(s); + ASSERT_OK(DeprecatedAddFile({sst_file_path})); for (int i = 0; i < 500; i++) { std::string k = Key(i); @@ -928,8 +894,7 @@ TEST_F(ExternalSSTFileTest, SkipSnapshot) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file1_info; - Status s = sst_file_writer.Finish(&file1_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file1_info)); ASSERT_EQ(file1_info.file_path, file1); ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); @@ -942,8 +907,7 @@ TEST_F(ExternalSSTFileTest, SkipSnapshot) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file2_info; - s = sst_file_writer.Finish(&file2_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file2_info)); ASSERT_EQ(file2_info.file_path, 
file2); ASSERT_EQ(file2_info.num_entries, 200); ASSERT_EQ(file2_info.smallest_key, Key(100)); @@ -972,8 +936,7 @@ TEST_F(ExternalSSTFileTest, SkipSnapshot) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); } ExternalSstFileInfo file3_info; - s = sst_file_writer.Finish(&file3_info); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish(&file3_info)); ASSERT_EQ(file3_info.file_path, file3); ASSERT_EQ(file3_info.num_entries, 100); ASSERT_EQ(file3_info.smallest_key, Key(300)); @@ -1019,8 +982,7 @@ TEST_F(ExternalSSTFileTest, MultiThreaded) { ASSERT_OK(sst_file_writer.Put(Key(k), Key(k))); } - Status s = sst_file_writer.Finish(); - ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_OK(sst_file_writer.Finish()); }; // Write num_files files in parallel std::vector sst_writer_threads; @@ -1082,8 +1044,7 @@ TEST_F(ExternalSSTFileTest, MultiThreaded) { // Overwrite values of keys divisible by 100 for (int k = 0; k < num_files * keys_per_file; k += 100) { std::string key = Key(k); - Status s = Put(key, key + "_new"); - ASSERT_TRUE(s.ok()); + ASSERT_OK(Put(key, key + "_new")); } for (int i = 0; i < 2; i++) { @@ -1167,7 +1128,8 @@ TEST_F(ExternalSSTFileTest, OverlappingRanges) { // Generate the file containing the range std::string file_name = sst_files_dir_ + env_->GenerateUniqueId(); - ASSERT_OK(sst_file_writer.Open(file_name)); + s = sst_file_writer.Open(file_name); + ASSERT_OK(s); for (int k = range_start; k <= range_end; k++) { s = sst_file_writer.Put(Key(k), range_val); ASSERT_OK(s); @@ -1212,10 +1174,10 @@ TEST_F(ExternalSSTFileTest, OverlappingRanges) { // Flush / Compact the DB if (i && i % 50 == 0) { - Flush(); + ASSERT_OK(Flush()); } if (i && i % 75 == 0) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } } @@ -1293,7 +1255,7 @@ TEST_P(ExternalSSTFileTest, PickedLevel) { // Hold compaction from finishing TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:2"); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); EXPECT_EQ(FilesPerLevel(), "1,1,1,2"); size_t kcnt = 0; @@ -1329,8 +1291,11 @@ TEST_F(ExternalSSTFileTest, PickedLevelBug) { // We have 2 overlapping files in L0 EXPECT_EQ(FilesPerLevel(), "2"); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::PickedLevelBug:0"}, + {{"DBImpl::IngestExternalFile:AfterIncIngestFileCounter", + "ExternalSSTFileTest::PickedLevelBug:0"}, {"ExternalSSTFileTest::PickedLevelBug:1", "DBImpl::AddFile:MutexUnlock"}, {"ExternalSSTFileTest::PickedLevelBug:2", "DBImpl::RunManualCompaction:0"}, @@ -1344,37 +1309,47 @@ TEST_F(ExternalSSTFileTest, PickedLevelBug) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - // While writing the MANIFEST start a thread that will ask for compaction - ROCKSDB_NAMESPACE::port::Thread bg_compact([&]() { - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - }); - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2"); + Status bg_compact_status; + Status bg_addfile_status; - // Start a thread that will ingest a new file - ROCKSDB_NAMESPACE::port::Thread bg_addfile([&]() { - file_keys = {1, 2, 3}; - ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, 1)); - }); + { + // While writing the MANIFEST start a thread that will ask for compaction + ThreadGuard bg_compact(port::Thread([&]() { + bg_compact_status = + 
db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + })); + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2"); + + // Start a thread that will ingest a new file + ThreadGuard bg_addfile(port::Thread([&]() { + file_keys = {1, 2, 3}; + bg_addfile_status = GenerateAndAddExternalFile(options, file_keys, 1); + })); + + // Wait for AddFile to start picking levels and writing MANIFEST + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0"); - // Wait for AddFile to start picking levels and writing MANIFEST - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0"); + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3"); - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3"); + // We need to verify that no compactions can run while AddFile is + // ingesting the files into the levels it find suitable. So we will + // wait for 2 seconds to give a chance for compactions to run during + // this period, and then make sure that no compactions where able to run + env_->SleepForMicroseconds(1000000 * 2); + bool bg_compact_started_tmp = bg_compact_started.load(); - // We need to verify that no compactions can run while AddFile is - // ingesting the files into the levels it find suitable. So we will - // wait for 2 seconds to give a chance for compactions to run during - // this period, and then make sure that no compactions where able to run - env_->SleepForMicroseconds(1000000 * 2); - ASSERT_FALSE(bg_compact_started.load()); + // Hold AddFile from finishing writing the MANIFEST + TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1"); - // Hold AddFile from finishing writing the MANIFEST - TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1"); + // check the status at the end, so even if the ASSERT fails the threads + // could be joined and return. + ASSERT_FALSE(bg_compact_started_tmp); + } - bg_addfile.join(); - bg_compact.join(); + ASSERT_OK(bg_addfile_status); + ASSERT_OK(bg_compact_status); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); int total_keys = 0; Iterator* iter = db_->NewIterator(ReadOptions()); @@ -1411,7 +1386,7 @@ TEST_F(ExternalSSTFileTest, IngestNonExistingFile) { // After full compaction, there should be only 1 file. 
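// Aside, not part of the patch: a minimal sketch of the join-guard pattern
// adopted in PickedLevelBug above. gtest ASSERT_* macros only return from the
// enclosing function, so asserting inside a background thread cannot stop the
// test; the change instead stores each thread's Status in a local and asserts
// on it after the guard has joined the thread. JoinGuard and DoBackgroundWork
// below are hypothetical stand-ins for ROCKSDB_NAMESPACE::ThreadGuard and the
// real test work.
#include <thread>
#include <utility>
#include "rocksdb/status.h"

class JoinGuard {
 public:
  explicit JoinGuard(std::thread&& t) : t_(std::move(t)) {}
  ~JoinGuard() {
    if (t_.joinable()) {
      t_.join();  // runs even if an ASSERT made the test body return early
    }
  }

 private:
  std::thread t_;
};

// Usage sketch inside a test body:
//   rocksdb::Status bg_status;
//   {
//     JoinGuard bg(std::thread([&]() { bg_status = DoBackgroundWork(); }));
//     // ... sync points and checks that may ASSERT and return early ...
//   }  // the background thread is joined here no matter what
//   ASSERT_OK(bg_status);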
std::vector files; - env_->GetChildren(dbname_, &files); + ASSERT_OK(env_->GetChildren(dbname_, &files)); int num_sst_files = 0; for (auto& f : files) { uint64_t number; @@ -1533,7 +1508,7 @@ TEST_F(ExternalSSTFileTest, PickedLevelDynamic) { TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:2"); // Output of the compaction will go to L3 - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); EXPECT_EQ(FilesPerLevel(), "1,0,0,2"); Close(); @@ -1675,7 +1650,7 @@ TEST_F(ExternalSSTFileTest, AddFileTrivialMoveBug) { cro.exclusive_manual_compaction = false; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -1727,9 +1702,9 @@ TEST_F(ExternalSSTFileTest, WithUnorderedWrite) { Options options = CurrentOptions(); options.unordered_write = true; DestroyAndReopen(options); - Put("foo", "v1"); + ASSERT_OK(Put("foo", "v1")); SyncPoint::GetInstance()->EnableProcessing(); - port::Thread writer([&]() { Put("bar", "v2"); }); + port::Thread writer([&]() { ASSERT_OK(Put("bar", "v2")); }); TEST_SYNC_POINT("ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL"); ASSERT_OK(GenerateAndAddExternalFile(options, {{"bar", "v3"}}, -1, @@ -1778,7 +1753,7 @@ TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) { } size_t kcnt = 0; VerifyDBFromMap(true_data, &kcnt, false); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); VerifyDBFromMap(true_data, &kcnt, false); } } @@ -1862,8 +1837,8 @@ TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) { ASSERT_OK(Put(Key(k), "memtable")); true_data[Key(k)] = "memtable"; } - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); bool write_global_seqno = std::get<0>(GetParam()); @@ -1872,40 +1847,40 @@ TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) { ASSERT_OK(GenerateAndAddExternalFile( options, {90, 100, 110}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); // This file will flush the memtable ASSERT_OK(GenerateAndAddExternalFile( options, {19, 20, 21}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_EQ(entries_in_memtable, 0); for (int k : {200, 201, 205, 206}) { ASSERT_OK(Put(Key(k), "memtable")); true_data[Key(k)] = "memtable"; } - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); // No need for flush, this file keys fit between the memtable keys ASSERT_OK(GenerateAndAddExternalFile( options, {202, 203, 204}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - 
db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_GE(entries_in_memtable, 1); // This file will flush the memtable ASSERT_OK(GenerateAndAddExternalFile( options, {206, 207}, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, - &entries_in_memtable); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &entries_in_memtable)); ASSERT_EQ(entries_in_memtable, 0); size_t kcnt = 0; @@ -2303,7 +2278,7 @@ TEST_P(ExternalSSTFileTest, IngestBehind) { ASSERT_OK(Put(Key(i), "memtable")); true_data[Key(i)] = "memtable"; } - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Universal picker should go at second from the bottom level ASSERT_EQ("0,1", FilesPerLevel()); ASSERT_OK(GenerateAndAddExternalFile( @@ -2317,7 +2292,7 @@ TEST_P(ExternalSSTFileTest, IngestBehind) { verify_checksums_before_ingest, true /*ingest_behind*/, false /*sort_data*/, &true_data)); ASSERT_EQ("0,1,1", FilesPerLevel()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // bottom level should be empty ASSERT_EQ("0,1", FilesPerLevel()); @@ -2465,9 +2440,8 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { // Resize the true_data vector upon construction to avoid re-alloc std::vector> true_data( column_families.size()); - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_OK(s); + ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); Close(); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, options); @@ -2648,9 +2622,8 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) { std::vector> true_data( column_families.size()); port::Thread ingest_thread([&]() { - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_NOK(s); + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); }); TEST_SYNC_POINT( "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:" @@ -2718,9 +2691,8 @@ TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) { std::vector> true_data( column_families.size()); port::Thread ingest_thread([&]() { - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_NOK(s); + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); }); TEST_SYNC_POINT( "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" @@ -2793,9 +2765,8 @@ TEST_P(ExternalSSTFileTest, std::vector> true_data( column_families.size()); port::Thread ingest_thread([&]() { - Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, - -1, true, true_data); - ASSERT_NOK(s); + ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); }); TEST_SYNC_POINT( "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" @@ -2806,7 +2777,7 @@ TEST_P(ExternalSSTFileTest, "PartialManifestWriteFail:1"); ingest_thread.join(); - 
fault_injection_env->DropUnsyncedFileData(); + ASSERT_OK(fault_injection_env->DropUnsyncedFileData()); fault_injection_env->SetFilesystemActive(true); Close(); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, @@ -2841,7 +2812,7 @@ TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) { // sure that it won't enter the 2nd writer queue for the second time. std::vector> data; data.push_back(std::make_pair("1001", "v2")); - GenerateAndAddExternalFile(options, data); + ASSERT_OK(GenerateAndAddExternalFile(options, data, -1, true)); } TEST_P(ExternalSSTFileTest, DeltaEncodingWhileGlobalSeqnoPresent) { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 7e208bbf918..1a3715e3213 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -16,7 +16,6 @@ #include "db/version_set.h" #include "env/mock_env.h" #include "file/filename.h" -#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -193,7 +192,7 @@ class FaultInjectionTest for (int i = start_idx; i < start_idx + num_vals; i++) { Slice key = Key(i, &key_space); batch.Clear(); - batch.Put(key, Value(i, &value_space)); + ASSERT_OK(batch.Put(key, Value(i, &value_space))); ASSERT_OK(db_->Write(write_options, &batch)); } } @@ -273,12 +272,12 @@ class FaultInjectionTest for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_OK(db_->Delete(WriteOptions(), iter->key())); } - + ASSERT_OK(iter->status()); delete iter; FlushOptions flush_options; flush_options.wait = true; - db_->Flush(flush_options); + ASSERT_OK(db_->Flush(flush_options)); } // rnd cannot be null for kResetDropRandomUnsyncedData @@ -311,7 +310,7 @@ class FaultInjectionTest Build(write_options, 0, num_pre_sync); if (sync_use_compact_) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } write_options.sync = false; Build(write_options, num_pre_sync, num_post_sync); @@ -343,7 +342,7 @@ class FaultInjectionTest } void WaitCompactionFinish() { - static_cast(db_->GetRootDB())->TEST_WaitForCompact(); + ASSERT_OK(static_cast(db_->GetRootDB())->TEST_WaitForCompact()); ASSERT_OK(db_->Put(WriteOptions(), "", "")); } }; @@ -410,7 +409,7 @@ TEST_P(FaultInjectionTest, WriteOptionSyncTest) { write_options.sync = true; ASSERT_OK( db_->Put(write_options, Key(2, &key_space), Value(2, &value_space))); - db_->FlushWAL(false); + ASSERT_OK(db_->FlushWAL(false)); env_->SetFilesystemActive(false); NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); @@ -451,7 +450,7 @@ TEST_P(FaultInjectionTest, UninstalledCompaction) { Build(WriteOptions(), 0, kNumKeys); FlushOptions flush_options; flush_options.wait = true; - db_->Flush(flush_options); + ASSERT_OK(db_->Flush(flush_options)); ASSERT_OK(db_->Put(WriteOptions(), "", "")); TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0"); TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1"); @@ -522,9 +521,9 @@ TEST_P(FaultInjectionTest, WriteBatchWalTerminationTest) { wo.sync = true; wo.disableWAL = false; WriteBatch batch; - batch.Put("cats", "dogs"); + ASSERT_OK(batch.Put("cats", "dogs")); batch.MarkWalTerminationPoint(); - batch.Put("boys", "girls"); + ASSERT_OK(batch.Put("boys", "girls")); ASSERT_OK(db_->Write(wo, &batch)); env_->SetFilesystemActive(false); diff --git a/db/filename_test.cc b/db/filename_test.cc index 1e53c952002..d166876ba00 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -10,7 +10,6 @@ #include "file/filename.h" 
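// Aside, not part of the patch: a minimal usage sketch of the
// WriteBatch::MarkWalTerminationPoint() behavior exercised by
// WriteBatchWalTerminationTest above. Updates appended after the termination
// point are applied to the memtable but are not written to the WAL, so they
// do not survive the simulated crash in that test.
#include "rocksdb/db.h"
#include "rocksdb/write_batch.h"

rocksdb::Status WriteWithWalCutoff(rocksdb::DB* db) {
  rocksdb::WriteBatch batch;
  rocksdb::Status s = batch.Put("cats", "dogs");  // logged to the WAL
  if (!s.ok()) {
    return s;
  }
  batch.MarkWalTerminationPoint();
  s = batch.Put("boys", "girls");  // memtable only, skipped in the WAL
  if (!s.ok()) {
    return s;
  }
  rocksdb::WriteOptions wo;
  wo.sync = true;
  wo.disableWAL = false;
  return db->Write(wo, &batch);
}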
#include "db/dbformat.h" -#include "logging/logging.h" #include "port/port.h" #include "test_util/testharness.h" @@ -35,23 +34,23 @@ TEST_F(FileNameTest, Parse) { FileType type; char mode; } cases[] = { - {"100.log", 100, kLogFile, kAllMode}, - {"0.log", 0, kLogFile, kAllMode}, - {"0.sst", 0, kTableFile, kAllMode}, - {"CURRENT", 0, kCurrentFile, kAllMode}, - {"LOCK", 0, kDBLockFile, kAllMode}, - {"MANIFEST-2", 2, kDescriptorFile, kAllMode}, - {"MANIFEST-7", 7, kDescriptorFile, kAllMode}, - {"METADB-2", 2, kMetaDatabase, kAllMode}, - {"METADB-7", 7, kMetaDatabase, kAllMode}, - {"LOG", 0, kInfoLogFile, kDefautInfoLogDir}, - {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir}, - {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir}, - {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir}, - {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir}, - {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir}, - {"18446744073709551615.log", 18446744073709551615ull, kLogFile, - kAllMode}, }; + {"100.log", 100, kWalFile, kAllMode}, + {"0.log", 0, kWalFile, kAllMode}, + {"0.sst", 0, kTableFile, kAllMode}, + {"CURRENT", 0, kCurrentFile, kAllMode}, + {"LOCK", 0, kDBLockFile, kAllMode}, + {"MANIFEST-2", 2, kDescriptorFile, kAllMode}, + {"MANIFEST-7", 7, kDescriptorFile, kAllMode}, + {"METADB-2", 2, kMetaDatabase, kAllMode}, + {"METADB-7", 7, kMetaDatabase, kAllMode}, + {"LOG", 0, kInfoLogFile, kDefautInfoLogDir}, + {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir}, + {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir}, + {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir}, + {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir}, + {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir}, + {"18446744073709551615.log", 18446744073709551615ull, kWalFile, kAllMode}, + }; for (char mode : {kDifferentInfoLogDir, kDefautInfoLogDir, kNoCheckLogDir}) { for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { InfoLogPrefix info_log_prefix(mode != kDefautInfoLogDir, "/rocksdb/dir"); @@ -142,7 +141,7 @@ TEST_F(FileNameTest, Construction) { ASSERT_EQ("foo/", std::string(fname.data(), 4)); ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); ASSERT_EQ(192U, number); - ASSERT_EQ(kLogFile, type); + ASSERT_EQ(kWalFile, type); fname = TableFileName({DbPath("bar", 0)}, 200, 0); std::string fname1 = diff --git a/db/flush_job.cc b/db/flush_job.cc index 6e2a60ff903..10d6ed10868 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -75,6 +75,8 @@ const char* GetFlushReasonString (FlushReason flush_reason) { return "Manual Flush"; case FlushReason::kErrorRecovery: return "Error Recovery"; + case FlushReason::kWalFull: + return "WAL Full"; default: return "Invalid"; } @@ -83,7 +85,7 @@ const char* GetFlushReasonString (FlushReason flush_reason) { FlushJob::FlushJob( const std::string& dbname, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, - const MutableCFOptions& mutable_cf_options, const uint64_t* max_memtable_id, + const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, const FileOptions& file_options, VersionSet* versions, InstrumentedMutex* db_mutex, std::atomic* shutting_down, std::vector existing_snapshots, @@ -94,7 +96,8 @@ FlushJob::FlushJob( Statistics* stats, EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, Env::Priority thread_pri, const std::shared_ptr& io_tracer, - const std::string& db_id, const std::string& db_session_id) + const std::string& 
db_id, const std::string& db_session_id, + std::string full_history_ts_low, BlobFileCompletionCallback* blob_callback) : dbname_(dbname), db_id_(db_id), db_session_id_(db_session_id), @@ -123,7 +126,10 @@ FlushJob::FlushJob( base_(nullptr), pick_memtable_called(false), thread_pri_(thread_pri), - io_tracer_(io_tracer) { + io_tracer_(io_tracer), + clock_(db_options_.clock), + full_history_ts_low_(std::move(full_history_ts_low)), + blob_callback_(blob_callback) { // Update the thread status to indicate flush. ReportStartedFlush(); TEST_SYNC_POINT("FlushJob::FlushJob()"); @@ -160,7 +166,6 @@ void FlushJob::RecordFlushIOStats() { ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); IOSTATS_RESET(bytes_written); } - void FlushJob::PickMemTable() { db_mutex_->AssertHeld(); assert(!pick_memtable_called); @@ -305,8 +310,8 @@ Status FlushJob::WriteLevel0Table() { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_FLUSH_WRITE_L0); db_mutex_->AssertHeld(); - const uint64_t start_micros = db_options_.env->NowMicros(); - const uint64_t start_cpu_micros = db_options_.env->NowCPUNanos() / 1000; + const uint64_t start_micros = clock_->NowMicros(); + const uint64_t start_cpu_micros = clock_->CPUNanos() / 1000; Status s; std::vector blob_file_additions; @@ -367,7 +372,7 @@ Status FlushJob::WriteLevel0Table() { TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression", &output_compression_); int64_t _current_time = 0; - auto status = db_options_.env->GetCurrentTime(&_current_time); + auto status = clock_->GetCurrentTime(&_current_time); // Safe to proceed even if GetCurrentTime fails. So, log and proceed. if (!status.ok()) { ROCKS_LOG_WARN( @@ -397,25 +402,49 @@ Status FlushJob::WriteLevel0Table() { ? current_time : meta_.oldest_ancester_time; + uint64_t num_input_entries = 0; + uint64_t memtable_payload_bytes = 0; + uint64_t memtable_garbage_bytes = 0; IOStatus io_s; + const std::string* const full_history_ts_low = + (full_history_ts_low_.empty()) ? 
nullptr : &full_history_ts_low_; + TableBuilderOptions tboptions( + *cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(), + cfd_->int_tbl_prop_collector_factories(), output_compression_, + mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(), + 0 /* level */, false /* is_bottommost */, + TableFileCreationReason::kFlush, creation_time, oldest_key_time, + current_time, db_id_, db_session_id_, 0 /* target_file_size */, + meta_.fd.GetNumber()); s = BuildTable( - dbname_, versions_, db_options_.env, db_options_.fs.get(), - *cfd_->ioptions(), mutable_cf_options_, file_options_, + dbname_, versions_, db_options_, tboptions, file_options_, cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_, - &blob_file_additions, cfd_->internal_comparator(), - cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(), - cfd_->GetName(), existing_snapshots_, + &blob_file_additions, existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, - output_compression_, mutable_cf_options_.sample_for_compression, - mutable_cf_options_.compression_opts, mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(), - TableFileCreationReason::kFlush, &io_s, io_tracer_, event_logger_, - job_context_->job_id, Env::IO_HIGH, &table_properties_, 0 /* level */, - creation_time, oldest_key_time, write_hint, current_time, db_id_, - db_session_id_); + &io_s, io_tracer_, event_logger_, job_context_->job_id, Env::IO_HIGH, + &table_properties_, write_hint, full_history_ts_low, blob_callback_, + &num_input_entries, &memtable_payload_bytes, &memtable_garbage_bytes); if (!io_s.ok()) { io_status_ = io_s; } + if (num_input_entries != total_num_entries && s.ok()) { + std::string msg = "Expected " + ToString(total_num_entries) + + " entries in memtables, but read " + + ToString(num_input_entries); + ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s", + cfd_->GetName().c_str(), job_context_->job_id, + msg.c_str()); + if (db_options_.flush_verify_memtable_count) { + s = Status::Corruption(msg); + } + } + if (tboptions.reason == TableFileCreationReason::kFlush) { + RecordTick(stats_, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + memtable_payload_bytes); + RecordTick(stats_, MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + memtable_garbage_bytes); + } LogFlush(db_options_.info_log); } ROCKS_LOG_INFO(db_options_.info_log, @@ -438,7 +467,6 @@ Status FlushJob::WriteLevel0Table() { // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. 
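// Aside, not part of the patch: the count check added above compares the
// number of entries the memtables claim to hold with the number the flush
// actually read, and (when enabled) turns a mismatch into Status::Corruption
// rather than only a log warning. A hedged sketch of opting in, assuming the
// corresponding DBOptions field is available in this build:
#include "rocksdb/options.h"

rocksdb::Options MakeStrictFlushOptions() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Fail the flush with Corruption if the memtable entry count does not match
  // what the flush actually read, instead of only logging a warning.
  options.flush_verify_memtable_count = true;
  return options;
}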
const bool has_output = meta_.fd.GetFileSize() > 0; - assert(has_output || blob_file_additions.empty()); if (s.ok() && has_output) { // if we have more than 1 background thread, then we cannot @@ -462,24 +490,26 @@ Status FlushJob::WriteLevel0Table() { // Note that here we treat flush as level 0 compaction in internal stats InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); - stats.micros = db_options_.env->NowMicros() - start_micros; - stats.cpu_micros = db_options_.env->NowCPUNanos() / 1000 - start_cpu_micros; + stats.micros = clock_->NowMicros() - start_micros; + stats.cpu_micros = clock_->CPUNanos() / 1000 - start_cpu_micros; if (has_output) { stats.bytes_written = meta_.fd.GetFileSize(); + stats.num_output_files = 1; + } - const auto& blobs = edit_->GetBlobFileAdditions(); - for (const auto& blob : blobs) { - stats.bytes_written += blob.GetTotalBlobBytes(); - } - - stats.num_output_files = static_cast(blobs.size()) + 1; + const auto& blobs = edit_->GetBlobFileAdditions(); + for (const auto& blob : blobs) { + stats.bytes_written_blob += blob.GetTotalBlobBytes(); } + stats.num_output_files_blob = static_cast(blobs.size()); + RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros); cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats); - cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, - stats.bytes_written); + cfd_->internal_stats()->AddCFStats( + InternalStats::BYTES_FLUSHED, + stats.bytes_written + stats.bytes_written_blob); RecordFlushIOStats(); return s; } diff --git a/db/flush_job.h b/db/flush_job.h index b724b2464d2..ff2ad85bcab 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -17,6 +17,7 @@ #include #include +#include "db/blob/blob_file_completion_callback.h" #include "db/column_family.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" @@ -60,10 +61,9 @@ class FlushJob { // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive FlushJob(const std::string& dbname, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, - const MutableCFOptions& mutable_cf_options, - const uint64_t* max_memtable_id, const FileOptions& file_options, - VersionSet* versions, InstrumentedMutex* db_mutex, - std::atomic* shutting_down, + const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id, + const FileOptions& file_options, VersionSet* versions, + InstrumentedMutex* db_mutex, std::atomic* shutting_down, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, SnapshotChecker* snapshot_checker, JobContext* job_context, @@ -73,8 +73,9 @@ class FlushJob { EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, Env::Priority thread_pri, const std::shared_ptr& io_tracer, - const std::string& db_id = "", - const std::string& db_session_id = ""); + const std::string& db_id = "", const std::string& db_session_id = "", + std::string full_history_ts_low = "", + BlobFileCompletionCallback* blob_callback = nullptr); ~FlushJob(); @@ -110,12 +111,11 @@ class FlushJob { ColumnFamilyData* cfd_; const ImmutableDBOptions& db_options_; const MutableCFOptions& mutable_cf_options_; - // Pointer to a variable storing the largest memtable id to flush in this + // A variable storing the largest memtable id to flush in this // flush job. RocksDB uses this variable to select the memtables to flush in // this job. All memtables in this column family with an ID smaller than or - // equal to *max_memtable_id_ will be selected for flush. 
If null, then all - // memtables in the column family will be selected. - const uint64_t* max_memtable_id_; + // equal to max_memtable_id_ will be selected for flush. + uint64_t max_memtable_id_; const FileOptions file_options_; VersionSet* versions_; InstrumentedMutex* db_mutex_; @@ -164,6 +164,10 @@ class FlushJob { IOStatus io_status_; const std::shared_ptr io_tracer_; + SystemClock* clock_; + + const std::string full_history_ts_low_; + BlobFileCompletionCallback* blob_callback_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 618594b2d1e..2366da201e1 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -16,6 +16,7 @@ #include "db/version_set.h" #include "file/writable_file_writer.h" #include "rocksdb/cache.h" +#include "rocksdb/file_system.h" #include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" #include "test_util/testharness.h" @@ -28,49 +29,35 @@ namespace ROCKSDB_NAMESPACE { // TODO(icanadi) Mock out everything else: // 1. VersionSet // 2. Memtable -class FlushJobTest : public testing::Test { - public: - FlushJobTest() +class FlushJobTestBase : public testing::Test { + protected: + FlushJobTestBase(std::string dbname, const Comparator* ucmp) : env_(Env::Default()), - fs_(std::make_shared(env_)), - dbname_(test::PerThreadDBPath("flush_job_test")), + fs_(env_->GetFileSystem()), + dbname_(std::move(dbname)), + ucmp_(ucmp), options_(), db_options_(options_), column_family_names_({kDefaultColumnFamilyName, "foo", "bar"}), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), shutting_down_(false), - mock_table_factory_(new mock::MockTableFactory()) { - EXPECT_OK(env_->CreateDirIfMissing(dbname_)); - db_options_.db_paths.emplace_back(dbname_, - std::numeric_limits::max()); - db_options_.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - // TODO(icanadi) Remove this once we mock out VersionSet - NewDB(); - std::vector column_families; - cf_options_.table_factory = mock_table_factory_; - for (const auto& cf_name : column_family_names_) { - column_families.emplace_back(cf_name, cf_options_); - } + mock_table_factory_(new mock::MockTableFactory()) {} - db_options_.env = env_; - db_options_.fs = fs_; - versions_.reset( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)); - EXPECT_OK(versions_->Recover(column_families, false)); + virtual ~FlushJobTestBase() { + if (getenv("KEEP_DB")) { + fprintf(stdout, "db is still in %s\n", dbname_.c_str()); + } else { + // destroy versions_ to release all file handles + versions_.reset(); + EXPECT_OK(DestroyDir(env_, dbname_)); + } } void NewDB() { - SetIdentityFile(env_, dbname_); + ASSERT_OK(SetIdentityFile(env_, dbname_)); VersionEdit new_db; - if (db_options_.write_dbid_to_manifest) { - DBImpl* impl = new DBImpl(DBOptions(), dbname_); - std::string db_id; - impl->GetDbIdentityFromIdentityFile(&db_id); - new_db.SetDBId(db_id); - } + new_db.SetLogNumber(0); new_db.SetNextFile(2); new_db.SetLastSequence(0); @@ -82,6 +69,7 @@ class FlushJobTest : public testing::Test { VersionEdit new_cf; new_cf.AddColumnFamily(column_family_names_[i]); new_cf.SetColumnFamily(cf_id++); + new_cf.SetComparatorName(ucmp_->Name()); new_cf.SetLogNumber(0); new_cf.SetNextFile(2); new_cf.SetLastSequence(last_seq++); @@ -89,17 +77,19 @@ class FlushJobTest : public testing::Test { } const std::string manifest = 
DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); ASSERT_OK(s); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, EnvOptions())); + { log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); + ASSERT_OK(s); for (const auto& e : new_cfs) { record.clear(); @@ -114,9 +104,38 @@ class FlushJobTest : public testing::Test { ASSERT_OK(s); } + void SetUp() override { + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + + // TODO(icanadi) Remove this once we mock out VersionSet + NewDB(); + + db_options_.env = env_; + db_options_.fs = fs_; + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + db_options_.statistics = CreateDBStatistics(); + + cf_options_.comparator = ucmp_; + + std::vector column_families; + cf_options_.table_factory = mock_table_factory_; + for (const auto& cf_name : column_family_names_) { + column_families.emplace_back(cf_name, cf_options_); + } + + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + EXPECT_OK(versions_->Recover(column_families, false)); + } + Env* env_; std::shared_ptr fs_; std::string dbname_; + const Comparator* const ucmp_; EnvOptions env_options_; Options options_; ImmutableDBOptions db_options_; @@ -131,6 +150,13 @@ class FlushJobTest : public testing::Test { std::shared_ptr mock_table_factory_; }; +class FlushJobTest : public FlushJobTestBase { + public: + FlushJobTest() + : FlushJobTestBase(test::PerThreadDBPath("flush_job_test"), + BytewiseComparator()) {} +}; + TEST_F(FlushJobTest, Empty) { JobContext job_context(0); auto cfd = versions_->GetColumnFamilySet()->GetDefault(); @@ -138,7 +164,7 @@ TEST_F(FlushJobTest, Empty) { SnapshotChecker* snapshot_checker = nullptr; // not relavant FlushJob flush_job( dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - *cfd->GetLatestMutableCFOptions(), nullptr /* memtable_id */, + *cfd->GetLatestMutableCFOptions(), port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, nullptr, &event_logger, false, @@ -167,7 +193,8 @@ TEST_F(FlushJobTest, NonEmpty) { for (int i = 1; i < 10000; ++i) { std::string key(ToString((i + 1000) % 10000)); std::string value("value" + key); - new_mem->Add(SequenceNumber(i), kTypeValue, key, value); + ASSERT_OK(new_mem->Add(SequenceNumber(i), kTypeValue, key, value, + nullptr /* kv_prot_info */)); if ((i + 1000) % 10000 < 9995) { InternalKey internal_key(key, SequenceNumber(i), kTypeValue); inserted_keys.push_back({internal_key.Encode().ToString(), value}); @@ -175,7 +202,8 @@ TEST_F(FlushJobTest, NonEmpty) { } { - new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", "9999a"); + ASSERT_OK(new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995", + "9999a", nullptr /* kv_prot_info */)); InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion); 
inserted_keys.push_back({internal_key.Encode().ToString(), "9999a"}); } @@ -202,7 +230,8 @@ TEST_F(FlushJobTest, NonEmpty) { } const SequenceNumber seq(i + 10001); - new_mem->Add(seq, kTypeBlobIndex, key, blob_index); + ASSERT_OK(new_mem->Add(seq, kTypeBlobIndex, key, blob_index, + nullptr /* kv_prot_info */)); InternalKey internal_key(key, seq, kTypeBlobIndex); inserted_keys.push_back({internal_key.Encode().ToString(), blob_index}); @@ -219,7 +248,7 @@ TEST_F(FlushJobTest, NonEmpty) { SnapshotChecker* snapshot_checker = nullptr; // not relavant FlushJob flush_job( dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - *cfd->GetLatestMutableCFOptions(), nullptr /* memtable_id */, + *cfd->GetLatestMutableCFOptions(), port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, @@ -263,8 +292,8 @@ TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) { for (size_t j = 0; j < num_keys_per_table; ++j) { std::string key(ToString(j + i * num_keys_per_table)); std::string value("value" + key); - mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, key, - value); + ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, + key, value, nullptr /* kv_prot_info */)); } } @@ -281,7 +310,7 @@ TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) { uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1; FlushJob flush_job( dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - *cfd->GetLatestMutableCFOptions(), &flush_memtable_id, env_options_, + *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_, versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, true, @@ -336,7 +365,8 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { for (size_t j = 0; j != num_keys_per_memtable; ++j) { std::string key(ToString(j + i * num_keys_per_memtable)); std::string value("value" + key); - mem->Add(curr_seqno++, kTypeValue, key, value); + ASSERT_OK(mem->Add(curr_seqno++, kTypeValue, key, value, + nullptr /* kv_prot_info */)); } cfd->imm()->Add(mem, &to_delete); @@ -353,7 +383,7 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { std::vector snapshot_seqs; flush_jobs.emplace_back(new FlushJob( dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), - &memtable_ids[k], env_options_, versions_.get(), &mutex_, + memtable_ids[k], env_options_, versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, true, @@ -391,8 +421,9 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { Status s = InstallMemtableAtomicFlushResults( nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list, - versions_.get(), &mutex_, file_meta_ptrs, &job_context.memtables_to_free, - nullptr /* db_directory */, nullptr /* log_buffer */); + versions_.get(), nullptr /* prep_tracker */, &mutex_, file_meta_ptrs, + &job_context.memtables_to_free, nullptr /* db_directory */, + nullptr /* log_buffer */); ASSERT_OK(s); mutex_.Unlock(); @@ -446,7 +477,8 @@ TEST_F(FlushJobTest, Snapshots) { for (int j = 0; j < insertions; ++j) { std::string value(rnd.HumanReadableString(10)); 
auto seqno = ++current_seqno; - new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value); + ASSERT_OK(new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value, + nullptr /* kv_prot_info */)); // a key is visible only if: // 1. it's the last one written (j == insertions - 1) // 2. there's a snapshot pointing at it @@ -470,7 +502,7 @@ TEST_F(FlushJobTest, Snapshots) { SnapshotChecker* snapshot_checker = nullptr; // not relavant FlushJob flush_job( dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - *cfd->GetLatestMutableCFOptions(), nullptr /* memtable_id */, + *cfd->GetLatestMutableCFOptions(), port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression, db_options_.statistics.get(), &event_logger, @@ -487,6 +519,136 @@ TEST_F(FlushJobTest, Snapshots) { job_context.Clean(); } +class FlushJobTimestampTest : public FlushJobTestBase { + public: + FlushJobTimestampTest() + : FlushJobTestBase(test::PerThreadDBPath("flush_job_ts_gc_test"), + test::ComparatorWithU64Ts()) {} + + void AddKeyValueToMemtable(MemTable* memtable, std::string key, uint64_t ts, + SequenceNumber seq, ValueType value_type, + Slice value) { + std::string key_str(std::move(key)); + PutFixed64(&key_str, ts); + ASSERT_OK(memtable->Add(seq, value_type, key_str, value, + nullptr /* kv_prot_info */)); + } + + protected: + static constexpr uint64_t kStartTs = 10; + static constexpr SequenceNumber kStartSeq = 0; + SequenceNumber curr_seq_{kStartSeq}; + std::atomic curr_ts_{kStartTs}; +}; + +TEST_F(FlushJobTimestampTest, AllKeysExpired) { + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + autovector to_delete; + + { + MemTable* new_mem = cfd->ConstructNewMemtable( + *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); + new_mem->Ref(); + for (int i = 0; i < 100; ++i) { + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeValue, "0_value"); + } + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeDeletionWithTimestamp, ""); + cfd->imm()->Add(new_mem, &to_delete); + } + + std::vector snapshots; + constexpr SnapshotChecker* const snapshot_checker = nullptr; + JobContext job_context(0); + EventLogger event_logger(db_options_.info_log.get()); + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, std::numeric_limits::max()); + FlushJob flush_job( + dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), + port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), + &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, snapshot_checker, + &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low); + + FileMetaData fmeta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta)); + mutex_.Unlock(); + + { + std::string key = test::EncodeInt(0); + key.append(test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1)); + InternalKey ikey(key, curr_seq_ - 1, ValueType::kTypeDeletionWithTimestamp); + ASSERT_EQ(ikey.Encode(), fmeta.smallest.Encode()); + 
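// Aside, not part of the patch: the key layout used by AddKeyValueToMemtable
// in FlushJobTimestampTest above -- the user key is followed by a fixed
// 64-bit timestamp appended with PutFixed64, and test::ComparatorWithU64Ts
// treats that suffix as the timestamp. A small sketch of building and
// splitting such keys (PutFixed64/DecodeFixed64 come from util/coding.h):
#include <cstdint>
#include <string>
#include "util/coding.h"

std::string MakeTsKey(const std::string& user_key, uint64_t ts) {
  std::string key_with_ts(user_key);
  ROCKSDB_NAMESPACE::PutFixed64(&key_with_ts, ts);  // 8-byte timestamp suffix
  return key_with_ts;
}

uint64_t ExtractTs(const std::string& key_with_ts) {
  // Assumes the key was built as above, i.e. it ends in an 8-byte timestamp.
  return ROCKSDB_NAMESPACE::DecodeFixed64(
      key_with_ts.data() + key_with_ts.size() - sizeof(uint64_t));
}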
ASSERT_EQ(ikey.Encode(), fmeta.largest.Encode()); + } + + job_context.Clean(); + ASSERT_TRUE(to_delete.empty()); +} + +TEST_F(FlushJobTimestampTest, NoKeyExpired) { + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + autovector to_delete; + + { + MemTable* new_mem = cfd->ConstructNewMemtable( + *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); + new_mem->Ref(); + for (int i = 0; i < 100; ++i) { + uint64_t ts = curr_ts_.fetch_add(1); + SequenceNumber seq = (curr_seq_++); + AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq, + ValueType::kTypeValue, "0_value"); + } + cfd->imm()->Add(new_mem, &to_delete); + } + + std::vector snapshots; + SnapshotChecker* const snapshot_checker = nullptr; + JobContext job_context(0); + EventLogger event_logger(db_options_.info_log.get()); + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 0); + FlushJob flush_job( + dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), + port::kMaxUint64 /* memtable_id */, env_options_, versions_.get(), + &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, snapshot_checker, + &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, true /* write_manifest */, + Env::Priority::USER, nullptr /*IOTracer*/, /*db_id=*/"", + /*db_session_id=*/"", full_history_ts_low); + + FileMetaData fmeta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta)); + mutex_.Unlock(); + + { + std::string ukey = test::EncodeInt(0); + std::string smallest_key = + ukey + test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1); + std::string largest_key = ukey + test::EncodeInt(kStartTs); + InternalKey smallest(smallest_key, curr_seq_ - 1, ValueType::kTypeValue); + InternalKey largest(largest_key, kStartSeq, ValueType::kTypeValue); + ASSERT_EQ(smallest.Encode(), fmeta.smallest.Encode()); + ASSERT_EQ(largest.Encode(), fmeta.largest.Encode()); + } + job_context.Clean(); + ASSERT_TRUE(to_delete.empty()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/flush_scheduler.h b/db/flush_scheduler.h index cbe17994fd4..eb03f3e1142 100644 --- a/db/flush_scheduler.h +++ b/db/flush_scheduler.h @@ -5,10 +5,11 @@ #pragma once -#include #include +#include #include #include + #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 013af04e997..80dd1bb9e86 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -46,7 +46,9 @@ class ForwardLevelIterator : public InternalIterator { file_iter_(nullptr), pinned_iters_mgr_(nullptr), prefix_extractor_(prefix_extractor), - allow_unprepared_value_(allow_unprepared_value) {} + allow_unprepared_value_(allow_unprepared_value) { + status_.PermitUncheckedError(); // Allow uninitialized status through + } ~ForwardLevelIterator() override { // Reset current pointer @@ -238,6 +240,12 @@ ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, if (sv_) { RebuildIterators(false); } + + // immutable_status_ is a local aggregation of the + // status of the immutable Iterators. + // We have to PermitUncheckedError in case it is never + // used, otherwise it will fail ASSERT_STATUS_CHECKED. 
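// Aside, not part of the patch: a minimal sketch of the
// Status::PermitUncheckedError() idiom the comment above refers to. Under
// ASSERT_STATUS_CHECKED builds, destroying a Status that was never inspected
// aborts the process; calling PermitUncheckedError() marks the status as
// intentionally ignored so such builds stay quiet.
#include "rocksdb/status.h"

void IgnoreBestEffortResult() {
  rocksdb::Status s = rocksdb::Status::NotFound("nothing to do");
  // This result is deliberately unused; tell the checked-status machinery so.
  s.PermitUncheckedError();
}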
+ immutable_status_.PermitUncheckedError(); } ForwardIterator::~ForwardIterator() { @@ -418,7 +426,7 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, if (seek_to_first) { l0_iters_[i]->SeekToFirst(); } else { - // If the target key passes over the larget key, we are sure Next() + // If the target key passes over the largest key, we are sure Next() // won't go over this file. if (user_comparator_->Compare(target_user_key, l0[i]->largest.user_key()) > 0) { @@ -985,9 +993,9 @@ bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters, uint32_t ForwardIterator::FindFileInRange( const std::vector& files, const Slice& internal_key, uint32_t left, uint32_t right) { - auto cmp = [&](const FileMetaData* f, const Slice& key) -> bool { + auto cmp = [&](const FileMetaData* f, const Slice& k) -> bool { return cfd_->internal_comparator().InternalKeyComparator::Compare( - f->largest.Encode(), key) < 0; + f->largest.Encode(), k) < 0; }; const auto &b = files.begin(); return static_cast(std::lower_bound(b + left, diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 9e9073483c1..7c8c44e4ed8 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -140,7 +140,7 @@ Status ImportColumnFamilyJob::Run() { int64_t temp_current_time = 0; uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; uint64_t current_time = kUnknownOldestAncesterTime; - if (env_->GetCurrentTime(&temp_current_time).ok()) { + if (clock_->GetCurrentTime(&temp_current_time).ok()) { current_time = oldest_ancester_time = static_cast(temp_current_time); } @@ -252,15 +252,21 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( // Get first (smallest) key from file iter->SeekToFirst(); - if (ParseInternalKey(iter->key(), &key) != Status::OK()) { - return Status::Corruption("external file have corrupted keys"); + Status pik_status = + ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted Key in external file. ", + pik_status.getState()); } file_to_import->smallest_internal_key.SetFrom(key); // Get last (largest) key from file iter->SeekToLast(); - if (ParseInternalKey(iter->key(), &key) != Status::OK()) { - return Status::Corruption("external file have corrupted keys"); + pik_status = + ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted Key in external file. ", + pik_status.getState()); } file_to_import->largest_internal_key.SetFrom(key); diff --git a/db/import_column_family_job.h b/db/import_column_family_job.h index 6cdde2473e8..3cf4eb56e89 100644 --- a/db/import_column_family_job.h +++ b/db/import_column_family_job.h @@ -9,24 +9,25 @@ #include "db/snapshot_impl.h" #include "options/db_options.h" #include "rocksdb/db.h" -#include "rocksdb/env.h" #include "rocksdb/metadata.h" #include "rocksdb/sst_file_writer.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +struct EnvOptions; +class SystemClock; // Imports a set of sst files as is into a new column family. Logic is similar // to ExternalSstFileIngestionJob. 
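// Aside, not part of the patch: the error-handling shape used above now that
// ParseInternalKey returns a Status. Call sites keep the parse status and
// fold its message into the Corruption they return, with allow_data_in_errors
// controlling how much key data may appear in that message.
#include "db/dbformat.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"

rocksdb::Status CheckExternalFileKey(const rocksdb::Slice& internal_key,
                                     bool allow_data_in_errors) {
  ROCKSDB_NAMESPACE::ParsedInternalKey parsed;
  rocksdb::Status pik_status = ROCKSDB_NAMESPACE::ParseInternalKey(
      internal_key, &parsed, allow_data_in_errors);
  if (!pik_status.ok()) {
    return rocksdb::Status::Corruption("Corrupted Key in external file. ",
                                       pik_status.getState());
  }
  return rocksdb::Status::OK();
}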
class ImportColumnFamilyJob { public: - ImportColumnFamilyJob(Env* env, VersionSet* versions, ColumnFamilyData* cfd, + ImportColumnFamilyJob(VersionSet* versions, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const EnvOptions& env_options, const ImportColumnFamilyOptions& import_options, const std::vector& metadata, const std::shared_ptr& io_tracer) - : env_(env), + : clock_(db_options.clock), versions_(versions), cfd_(cfd), db_options_(db_options), @@ -59,7 +60,7 @@ class ImportColumnFamilyJob { IngestedFileInfo* file_to_import, SuperVersion* sv); - Env* env_; + SystemClock* clock_; VersionSet* versions_; ColumnFamilyData* cfd_; const ImmutableDBOptions& db_options_; diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index fd77e04c5dd..3e76db76ca2 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ -16,8 +16,8 @@ class ImportColumnFamilyTest : public DBTestBase { ImportColumnFamilyTest() : DBTestBase("/import_column_family_test", /*env_do_fsync=*/true) { sst_files_dir_ = dbname_ + "/sst_files/"; - DestroyAndRecreateExternalSSTFilesDir(); export_files_dir_ = test::PerThreadDBPath(env_, "export"); + DestroyAndRecreateExternalSSTFilesDir(); import_cfh_ = nullptr; import_cfh2_ = nullptr; metadata_ptr_ = nullptr; @@ -104,9 +104,9 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) { ASSERT_NE(import_cfh_, nullptr); std::string value; - db_->Get(ReadOptions(), import_cfh_, "K1", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K1", &value)); ASSERT_EQ(value, "V1"); - db_->Get(ReadOptions(), import_cfh_, "K2", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K2", &value)); ASSERT_EQ(value, "V2"); ASSERT_OK(db_->DropColumnFamily(import_cfh_)); ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); @@ -125,9 +125,9 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) { ASSERT_NE(import_cfh_, nullptr); std::string value; - db_->Get(ReadOptions(), import_cfh_, "K3", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K3", &value)); ASSERT_EQ(value, "V1"); - db_->Get(ReadOptions(), import_cfh_, "K4", &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K4", &value)); ASSERT_EQ(value, "V2"); } } @@ -214,7 +214,7 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { for (int i = 0; i < 100; i++) { std::string value; - db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); if (i % 16 == 0) { ASSERT_EQ(value, Key(i) + "_overwrite4"); } else if (i % 4 == 0) { @@ -235,7 +235,7 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_)); for (int i = 0; i < 100; i++) { std::string value; - db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); if (i % 5 == 0) { ASSERT_EQ(value, Key(i) + "_overwrite5"); } else if (i % 16 == 0) { @@ -254,7 +254,7 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr)); for (int i = 0; i < 100; i++) { std::string value; - db_->Get(ReadOptions(), import_cfh_, Key(i), &value); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value)); if (i % 5 == 0) { ASSERT_EQ(value, Key(i) + "_overwrite5"); } else if (i % 16 == 0) { @@ -318,12 +318,12 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { std::string value1, value2; for (int i = 0; 
i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Get(1, Key(i)), value1); } for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2)); ASSERT_EQ(Get(1, Key(i)), value2); } @@ -340,16 +340,16 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); } for (int i = 25; i < 50; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite3", value1); } for (int i = 50; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite2", value1); } for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2)); ASSERT_EQ(Get(1, Key(i)), value2); } @@ -363,16 +363,16 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound()); } for (int i = 25; i < 50; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite3", value1); } for (int i = 50; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh_, Key(i), &value1); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); ASSERT_EQ(Key(i) + "_overwrite2", value1); } for (int i = 0; i < 100; ++i) { - db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2)); ASSERT_EQ(Get(1, Key(i)), value2); } } @@ -424,7 +424,7 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { for (int i = 0; i < 100; ++i) { std::string value; - db_copy->Get(ReadOptions(), cfh, Key(i), &value); + ASSERT_OK(db_copy->Get(ReadOptions(), cfh, Key(i), &value)); ASSERT_EQ(Get(1, Key(i)), value); } ASSERT_OK(db_copy->DropColumnFamily(cfh)); diff --git a/db/internal_stats.cc b/db/internal_stats.cc index afe3f71419a..5f1f06fa5a6 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -12,15 +12,20 @@ #include #include +#include #include #include #include #include #include +#include "cache/cache_entry_roles.h" +#include "cache/cache_entry_stats.h" #include "db/column_family.h" #include "db/db_impl/db_impl.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" +#include "table/block_based/cachable_entry.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -50,6 +55,8 @@ const std::map InternalStats::compaction_level_stats = {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}}, {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}}, {LevelStatType::KEY_DROP, LevelStat{"KeyDrop", "KeyDrop"}}, + {LevelStatType::R_BLOB_GB, LevelStat{"RblobGB", "Rblob(GB)"}}, + {LevelStatType::W_BLOB_GB, LevelStat{"WblobGB", "Wblob(GB)"}}, }; namespace { @@ -61,12 +68,14 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name, const std::string& group_by) { int written_size = snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str()); + written_size = std::min(written_size, static_cast(len)); auto hdr = [](LevelStatType t) { return InternalStats::compaction_level_stats.at(t).header_name.c_str(); }; int 
line_size = snprintf( buf + written_size, len - written_size, - "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", + "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s " + "%s\n", // Note that we skip COMPACTED_FILES and merge it with Files column group_by.c_str(), hdr(LevelStatType::NUM_FILES), hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE), @@ -77,9 +86,11 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name, hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC), hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT), hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN), - hdr(LevelStatType::KEY_DROP)); + hdr(LevelStatType::KEY_DROP), hdr(LevelStatType::R_BLOB_GB), + hdr(LevelStatType::W_BLOB_GB)); written_size += line_size; + written_size = std::min(written_size, static_cast(len)); snprintf(buf + written_size, len - written_size, "%s\n", std::string(line_size, '-').c_str()); } @@ -88,10 +99,12 @@ void PrepareLevelStats(std::map* level_stats, int num_files, int being_compacted, double total_file_size, double score, double w_amp, const InternalStats::CompactionStats& stats) { - uint64_t bytes_read = - stats.bytes_read_non_output_levels + stats.bytes_read_output_level; - int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level; - double elapsed = (stats.micros + 1) / kMicrosInSec; + const uint64_t bytes_read = stats.bytes_read_non_output_levels + + stats.bytes_read_output_level + + stats.bytes_read_blob; + const uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob; + const int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level; + const double elapsed = (stats.micros + 1) / kMicrosInSec; (*level_stats)[LevelStatType::NUM_FILES] = num_files; (*level_stats)[LevelStatType::COMPACTED_FILES] = being_compacted; @@ -106,8 +119,7 @@ void PrepareLevelStats(std::map* level_stats, (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB; (*level_stats)[LevelStatType::WRITE_AMP] = w_amp; (*level_stats)[LevelStatType::READ_MBPS] = bytes_read / kMB / elapsed; - (*level_stats)[LevelStatType::WRITE_MBPS] = - stats.bytes_written / kMB / elapsed; + (*level_stats)[LevelStatType::WRITE_MBPS] = bytes_written / kMB / elapsed; (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec; (*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec; (*level_stats)[LevelStatType::COMP_COUNT] = stats.count; @@ -117,6 +129,8 @@ void PrepareLevelStats(std::map* level_stats, static_cast(stats.num_input_records); (*level_stats)[LevelStatType::KEY_DROP] = static_cast(stats.num_dropped_records); + (*level_stats)[LevelStatType::R_BLOB_GB] = stats.bytes_read_blob / kGB; + (*level_stats)[LevelStatType::W_BLOB_GB] = stats.bytes_written_blob / kGB; } void PrintLevelStats(char* buf, size_t len, const std::string& name, @@ -141,7 +155,9 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, "%9d " /* Comp(cnt) */ "%8.3f " /* Avg(sec) */ "%7s " /* KeyIn */ - "%6s\n", /* KeyDrop */ + "%6s " /* KeyDrop */ + "%9.1f " /* Rblob(GB) */ + "%9.1f\n", /* Wblob(GB) */ name.c_str(), static_cast(stat_value.at(LevelStatType::NUM_FILES)), static_cast(stat_value.at(LevelStatType::COMPACTED_FILES)), BytesToHumanString( @@ -166,7 +182,9 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, .c_str(), NumberToHumanString( static_cast(stat_value.at(LevelStatType::KEY_DROP))) - .c_str()); + .c_str(), + stat_value.at(LevelStatType::R_BLOB_GB), + 
stat_value.at(LevelStatType::W_BLOB_GB)); } void PrintLevelStats(char* buf, size_t len, const std::string& name, @@ -207,6 +225,7 @@ static const std::string cfstats_no_file_histogram = static const std::string cf_file_histogram = "cf-file-histogram"; static const std::string dbstats = "dbstats"; static const std::string levelstats = "levelstats"; +static const std::string block_cache_entry_stats = "block-cache-entry-stats"; static const std::string num_immutable_mem_table = "num-immutable-mem-table"; static const std::string num_immutable_mem_table_flushed = "num-immutable-mem-table-flushed"; @@ -273,6 +292,8 @@ const std::string DB::Properties::kCFFileHistogram = rocksdb_prefix + cf_file_histogram; const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats; const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats; +const std::string DB::Properties::kBlockCacheEntryStats = + rocksdb_prefix + block_cache_entry_stats; const std::string DB::Properties::kNumImmutableMemTable = rocksdb_prefix + num_immutable_mem_table; const std::string DB::Properties::kNumImmutableMemTableFlushed = @@ -372,14 +393,18 @@ const std::unordered_map nullptr}}, {DB::Properties::kDBStats, {false, &InternalStats::HandleDBStats, nullptr, nullptr, nullptr}}, + {DB::Properties::kBlockCacheEntryStats, + {false, &InternalStats::HandleBlockCacheEntryStats, nullptr, + &InternalStats::HandleBlockCacheEntryStatsMap, nullptr}}, {DB::Properties::kSSTables, {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}}, {DB::Properties::kAggregatedTableProperties, {false, &InternalStats::HandleAggregatedTableProperties, nullptr, - nullptr, nullptr}}, + &InternalStats::HandleAggregatedTablePropertiesMap, nullptr}}, {DB::Properties::kAggregatedTablePropertiesAtLevel, {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel, - nullptr, nullptr, nullptr}}, + nullptr, &InternalStats::HandleAggregatedTablePropertiesAtLevelMap, + nullptr}}, {DB::Properties::kNumImmutableMemTable, {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr, nullptr}}, @@ -489,6 +514,159 @@ const std::unordered_map &DBImpl::GetPropertyHandleOptionsStatistics}}, }; +InternalStats::InternalStats(int num_levels, SystemClock* clock, + ColumnFamilyData* cfd) + : db_stats_{}, + cf_stats_value_{}, + cf_stats_count_{}, + comp_stats_(num_levels), + comp_stats_by_pri_(Env::Priority::TOTAL), + file_read_latency_(num_levels), + bg_error_count_(0), + number_levels_(num_levels), + clock_(clock), + cfd_(cfd), + started_at_(clock->NowMicros()) {} + +Status InternalStats::CollectCacheEntryStats(bool foreground) { + // Lazy initialize/reference the collector. It is pinned in cache (through + // a shared_ptr) so that it does not get immediately ejected from a full + // cache, which would force a re-scan on the next GetStats. + if (!cache_entry_stats_collector_) { + Cache* block_cache; + bool ok = HandleBlockCacheStat(&block_cache); + if (ok) { + // Extract or create stats collector. + Status s = CacheEntryStatsCollector::GetShared( + block_cache, clock_, &cache_entry_stats_collector_); + if (!s.ok()) { + // Block cache likely under pressure. Scanning could make it worse, + // so skip. + return s; + } + } else { + return Status::NotFound("block cache not configured"); + } + } + assert(cache_entry_stats_collector_); + + // For "background" collections, strictly cap the collection time by + // expanding effective cache TTL. For foreground, be more aggressive about + // getting latest data. 
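// Aside, not part of the patch: a usage sketch for the block-cache-entry-stats
// property registered above. Both the human-readable string form and the
// string-map form are wired up; `db` is assumed to be an open rocksdb::DB*.
#include <cstdio>
#include <map>
#include <string>
#include "rocksdb/db.h"

void DumpBlockCacheEntryStats(rocksdb::DB* db) {
  std::string text;
  if (db->GetProperty(rocksdb::DB::Properties::kBlockCacheEntryStats, &text)) {
    // e.g. "Block cache <id> capacity: ... entry stats(count,size,portion): ..."
    std::fprintf(stdout, "%s\n", text.c_str());
  }
  std::map<std::string, std::string> values;
  if (db->GetMapProperty(rocksdb::DB::Properties::kBlockCacheEntryStats,
                         &values)) {
    for (const auto& kv : values) {
      std::fprintf(stdout, "%s = %s\n", kv.first.c_str(), kv.second.c_str());
    }
  }
}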
+ int min_interval_seconds = foreground ? 10 : 180; + // 1/500 = max of 0.2% of one CPU thread + int min_interval_factor = foreground ? 10 : 500; + cache_entry_stats_collector_->GetStats( + &cache_entry_stats_, min_interval_seconds, min_interval_factor); + return Status::OK(); +} + +std::function +InternalStats::CacheEntryRoleStats::GetEntryCallback() { + return [&](const Slice& /*key*/, void* /*value*/, size_t charge, + Cache::DeleterFn deleter) { + auto e = role_map_.find(deleter); + size_t role_idx; + if (e == role_map_.end()) { + role_idx = static_cast(CacheEntryRole::kMisc); + } else { + role_idx = static_cast(e->second); + } + entry_counts[role_idx]++; + total_charges[role_idx] += charge; + }; +} + +void InternalStats::CacheEntryRoleStats::BeginCollection( + Cache* cache, SystemClock*, uint64_t start_time_micros) { + Clear(); + last_start_time_micros_ = start_time_micros; + ++collection_count; + role_map_ = CopyCacheDeleterRoleMap(); + std::ostringstream str; + str << cache->Name() << "@" << static_cast(cache); + cache_id = str.str(); + cache_capacity = cache->GetCapacity(); +} + +void InternalStats::CacheEntryRoleStats::EndCollection( + Cache*, SystemClock*, uint64_t end_time_micros) { + last_end_time_micros_ = end_time_micros; +} + +void InternalStats::CacheEntryRoleStats::SkippedCollection() { + ++copies_of_last_collection; +} + +uint64_t InternalStats::CacheEntryRoleStats::GetLastDurationMicros() const { + if (last_end_time_micros_ > last_start_time_micros_) { + return last_end_time_micros_ - last_start_time_micros_; + } else { + return 0U; + } +} + +std::string InternalStats::CacheEntryRoleStats::ToString( + SystemClock* clock) const { + std::ostringstream str; + str << "Block cache " << cache_id + << " capacity: " << BytesToHumanString(cache_capacity) + << " collections: " << collection_count + << " last_copies: " << copies_of_last_collection + << " last_secs: " << (GetLastDurationMicros() / 1000000.0) + << " secs_since: " + << ((clock->NowMicros() - last_end_time_micros_) / 1000000U) << "\n"; + str << "Block cache entry stats(count,size,portion):"; + for (size_t i = 0; i < kNumCacheEntryRoles; ++i) { + if (entry_counts[i] > 0) { + str << " " << kCacheEntryRoleToCamelString[i] << "(" << entry_counts[i] + << "," << BytesToHumanString(total_charges[i]) << "," + << (100.0 * total_charges[i] / cache_capacity) << "%)"; + } + } + str << "\n"; + return str.str(); +} + +void InternalStats::CacheEntryRoleStats::ToMap( + std::map* values, SystemClock* clock) const { + values->clear(); + auto& v = *values; + v["id"] = cache_id; + v["capacity"] = ROCKSDB_NAMESPACE::ToString(cache_capacity); + v["secs_for_last_collection"] = + ROCKSDB_NAMESPACE::ToString(GetLastDurationMicros() / 1000000.0); + v["secs_since_last_collection"] = ROCKSDB_NAMESPACE::ToString( + (clock->NowMicros() - last_end_time_micros_) / 1000000U); + for (size_t i = 0; i < kNumCacheEntryRoles; ++i) { + std::string role = kCacheEntryRoleToHyphenString[i]; + v["count." + role] = ROCKSDB_NAMESPACE::ToString(entry_counts[i]); + v["bytes." + role] = ROCKSDB_NAMESPACE::ToString(total_charges[i]); + v["percent." 
+ role] = + ROCKSDB_NAMESPACE::ToString(100.0 * total_charges[i] / cache_capacity); + } +} + +bool InternalStats::HandleBlockCacheEntryStats(std::string* value, + Slice /*suffix*/) { + Status s = CollectCacheEntryStats(/*foreground*/ true); + if (!s.ok()) { + return false; + } + *value = cache_entry_stats_.ToString(clock_); + return true; +} + +bool InternalStats::HandleBlockCacheEntryStatsMap( + std::map* values, Slice /*suffix*/) { + Status s = CollectCacheEntryStats(/*foreground*/ true); + if (!s.ok()) { + return false; + } + cache_entry_stats_.ToMap(values, clock_); + return true; +} + const DBPropertyInfo* GetPropertyInfo(const Slice& property) { std::string ppt_name = GetPropertyNameAndArg(property).first.ToString(); auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name); @@ -508,11 +686,12 @@ bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info, } bool InternalStats::GetMapProperty(const DBPropertyInfo& property_info, - const Slice& /*property*/, + const Slice& property, std::map* value) { assert(value != nullptr); assert(property_info.handle_map != nullptr); - return (this->*(property_info.handle_map))(value); + Slice arg = GetPropertyNameAndArg(property).second; + return (this->*(property_info.handle_map))(value, arg); } bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info, @@ -588,7 +767,7 @@ bool InternalStats::HandleStats(std::string* value, Slice suffix) { } bool InternalStats::HandleCFMapStats( - std::map* cf_stats) { + std::map* cf_stats, Slice /*suffix*/) { DumpCFMapStats(cf_stats); return true; } @@ -632,7 +811,27 @@ bool InternalStats::HandleAggregatedTableProperties(std::string* value, return true; } -bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value, +static std::map MapUint64ValuesToString( + const std::map& from) { + std::map to; + for (const auto& e : from) { + to[e.first] = ToString(e.second); + } + return to; +} + +bool InternalStats::HandleAggregatedTablePropertiesMap( + std::map* values, Slice /*suffix*/) { + std::shared_ptr tp; + auto s = cfd_->current()->GetAggregatedTableProperties(&tp); + if (!s.ok()) { + return false; + } + *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap()); + return true; +} + +bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values, Slice suffix) { uint64_t level; bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); @@ -645,7 +844,24 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value, if (!s.ok()) { return false; } - *value = tp->ToString(); + *values = tp->ToString(); + return true; +} + +bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap( + std::map* values, Slice suffix) { + uint64_t level; + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || static_cast(level) >= number_levels_) { + return false; + } + std::shared_ptr tp; + auto s = cfd_->current()->GetAggregatedTableProperties( + &tp, static_cast(level)); + if (!s.ok()) { + return false; + } + *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap()); return true; } @@ -699,21 +915,24 @@ bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/, bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { // Current size of the active memtable - *value = cfd_->mem()->ApproximateMemoryUsage(); + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = 
cfd_->mem()->ApproximateMemoryUsageFast(); return true; } bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { // Current size of the active memtable + immutable memtables - *value = cfd_->mem()->ApproximateMemoryUsage() + + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast() + cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage(); return true; } bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { - *value = cfd_->mem()->ApproximateMemoryUsage() + + // Using ApproximateMemoryUsageFast to avoid the need for synchronization + *value = cfd_->mem()->ApproximateMemoryUsageFast() + cfd_->imm()->ApproximateMemoryUsage(); return true; } @@ -906,7 +1125,7 @@ bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/, bool InternalStats::HandleBlockCacheStat(Cache** block_cache) { assert(block_cache != nullptr); - auto* table_factory = cfd_->ioptions()->table_factory; + auto* table_factory = cfd_->ioptions()->table_factory.get(); assert(table_factory != nullptr); *block_cache = table_factory->GetOptions(TableFactory::kBlockCacheOpts()); @@ -949,7 +1168,7 @@ bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/, void InternalStats::DumpDBStats(std::string* value) { char buf[1000]; // DB-level stats, only available from default column family - double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec; + double seconds_up = (clock_->NowMicros() - started_at_ + 1) / kMicrosInSec; double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up; snprintf(buf, sizeof(buf), "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n", @@ -1033,7 +1252,7 @@ void InternalStats::DumpDBStats(std::string* value) { snprintf( buf, sizeof(buf), "Interval WAL: %s writes, %s syncs, " - "%.2f writes per sync, written: %.2f MB, %.2f MB/s\n", + "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n", NumberToHumanString(interval_write_with_wal).c_str(), NumberToHumanString(interval_wal_synced).c_str(), interval_write_with_wal / static_cast(interval_wal_synced + 1), @@ -1071,9 +1290,10 @@ void InternalStats::DumpDBStats(std::string* value) { */ void InternalStats::DumpCFMapStats( std::map* cf_stats) { + const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); CompactionStats compaction_stats_sum; std::map> levels_stats; - DumpCFMapStats(&levels_stats, &compaction_stats_sum); + DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum); for (auto const& level_ent : levels_stats) { auto level_str = level_ent.first == -1 ? "Sum" : "L" + ToString(level_ent.first); @@ -1090,9 +1310,10 @@ void InternalStats::DumpCFMapStats( } void InternalStats::DumpCFMapStats( + const VersionStorageInfo* vstorage, std::map>* levels_stats, CompactionStats* compaction_stats_sum) { - const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); + assert(vstorage); int num_levels_to_check = (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO) @@ -1133,12 +1354,14 @@ void InternalStats::DumpCFMapStats( if (level == 0) { input_bytes = curr_ingest; } else { - input_bytes = comp_stats_[level].bytes_read_non_output_levels; + input_bytes = comp_stats_[level].bytes_read_non_output_levels + + comp_stats_[level].bytes_read_blob; } double w_amp = (input_bytes == 0) ? 
0.0 - : static_cast(comp_stats_[level].bytes_written) / + : static_cast(comp_stats_[level].bytes_written + + comp_stats_[level].bytes_written_blob) / input_bytes; std::map level_stats; PrepareLevelStats(&level_stats, files, files_being_compacted[level], @@ -1148,7 +1371,8 @@ void InternalStats::DumpCFMapStats( } } // Cumulative summary - double w_amp = compaction_stats_sum->bytes_written / + double w_amp = (compaction_stats_sum->bytes_written + + compaction_stats_sum->bytes_written_blob) / static_cast(curr_ingest + 1); // Stats summary across levels std::map sum_stats; @@ -1215,9 +1439,10 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { value->append(buf); // Print stats for each level + const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); std::map> levels_stats; CompactionStats compaction_stats_sum; - DumpCFMapStats(&levels_stats, &compaction_stats_sum); + DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum); for (int l = 0; l < number_levels_; ++l) { if (levels_stats.find(l) != levels_stats.end()) { PrintLevelStats(buf, sizeof(buf), "L" + ToString(l), levels_stats[l]); @@ -1253,7 +1478,8 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { CompactionStats interval_stats(compaction_stats_sum); interval_stats.Subtract(cf_stats_snapshot_.comp_stats); double w_amp = - interval_stats.bytes_written / static_cast(interval_ingest); + (interval_stats.bytes_written + interval_stats.bytes_written_blob) / + static_cast(interval_ingest); PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats); value->append(buf); @@ -1272,7 +1498,13 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { } } - double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec; + snprintf(buf, sizeof(buf), + "\nBlob file count: %" ROCKSDB_PRIszt ", total size: %.1f GB\n\n", + vstorage->GetBlobFiles().size(), + vstorage->GetTotalBlobFileSize() / kGB); + value->append(buf); + + double seconds_up = (clock_->NowMicros() - started_at_ + 1) / kMicrosInSec; double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up; snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n", seconds_up, interval_seconds_up); @@ -1312,8 +1544,10 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { uint64_t compact_micros = 0; for (int level = 0; level < number_levels_; level++) { compact_bytes_read += comp_stats_[level].bytes_read_output_level + - comp_stats_[level].bytes_read_non_output_levels; - compact_bytes_write += comp_stats_[level].bytes_written; + comp_stats_[level].bytes_read_non_output_levels + + comp_stats_[level].bytes_read_blob; + compact_bytes_write += comp_stats_[level].bytes_written + + comp_stats_[level].bytes_written_blob; compact_micros += comp_stats_[level].micros; } @@ -1384,6 +1618,16 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile; cf_stats_snapshot_.comp_stats = compaction_stats_sum; cf_stats_snapshot_.stall_count = total_stall_count; + + // Always treat CFStats context as "background" + Status s = CollectCacheEntryStats(/*foreground=*/false); + if (s.ok()) { + value->append(cache_entry_stats_.ToString(clock_)); + } else { + value->append("Block cache: "); + value->append(s.ToString()); + value->append("\n"); + } } void InternalStats::DumpCFFileHistogram(std::string* value) { @@ -1406,7 +1650,7 @@ void InternalStats::DumpCFFileHistogram(std::string* value) { << 
blob_file_read_latency_.ToString() << '\n'; } - *value = oss.str(); + value->append(oss.str()); } #else diff --git a/db/internal_stats.h b/db/internal_stats.h index edb2c0582cb..023bf3b5ef9 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -9,16 +9,22 @@ // #pragma once + #include +#include #include #include +#include "cache/cache_entry_roles.h" #include "db/version_set.h" +#include "rocksdb/system_clock.h" class ColumnFamilyData; namespace ROCKSDB_NAMESPACE { +template +class CacheEntryStatsCollector; class DBImpl; class MemTableList; @@ -44,7 +50,9 @@ struct DBPropertyInfo { Version* version); // @param props Map of general properties to populate - bool (InternalStats::*handle_map)(std::map* props); + // @param suffix Argument portion of the property. (see handle_string) + bool (InternalStats::*handle_map)(std::map* props, + Slice suffix); // handle the string type properties rely on DBImpl methods // @param value Value-result argument for storing the property's string value @@ -76,6 +84,8 @@ enum class LevelStatType { AVG_SEC, KEY_IN, KEY_DROP, + R_BLOB_GB, + W_BLOB_GB, TOTAL // total number of types }; @@ -120,18 +130,7 @@ class InternalStats { kIntStatsNumMax, }; - InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) - : db_stats_{}, - cf_stats_value_{}, - cf_stats_count_{}, - comp_stats_(num_levels), - comp_stats_by_pri_(Env::Priority::TOTAL), - file_read_latency_(num_levels), - bg_error_count_(0), - number_levels_(num_levels), - env_(env), - cfd_(cfd), - started_at_(env->NowMicros()) {} + InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd); // Per level compaction stats. comp_stats_[level] stores the stats for // compactions that produced data for the specified "level". @@ -139,32 +138,42 @@ class InternalStats { uint64_t micros; uint64_t cpu_micros; - // The number of bytes read from all non-output levels + // The number of bytes read from all non-output levels (table files) uint64_t bytes_read_non_output_levels; - // The number of bytes read from the compaction output level. + // The number of bytes read from the compaction output level (table files) uint64_t bytes_read_output_level; - // Total number of bytes written during compaction + // The number of bytes read from blob files + uint64_t bytes_read_blob; + + // Total number of bytes written to table files during compaction uint64_t bytes_written; - // Total number of bytes moved to the output level + // Total number of bytes written to blob files during compaction + uint64_t bytes_written_blob; + + // Total number of bytes moved to the output level (table files) uint64_t bytes_moved; - // The number of compaction input files in all non-output levels. + // The number of compaction input files in all non-output levels (table + // files) int num_input_files_in_non_output_levels; - // The number of compaction input files in the output level. + // The number of compaction input files in the output level (table files) int num_input_files_in_output_level; - // The number of compaction output files. 
+ // The number of compaction output files (table files) int num_output_files; + // The number of compaction output files (blob files) + int num_output_files_blob; + // Total incoming entries during compaction between levels N and N+1 uint64_t num_input_records; // Accumulated diff number of entries - // (num input entries - num output entires) for compaction levels N and N+1 + // (num input entries - num output entries) for compaction levels N and N+1 uint64_t num_dropped_records; // Number of compactions done @@ -178,11 +187,14 @@ class InternalStats { cpu_micros(0), bytes_read_non_output_levels(0), bytes_read_output_level(0), + bytes_read_blob(0), bytes_written(0), + bytes_written_blob(0), bytes_moved(0), num_input_files_in_non_output_levels(0), num_input_files_in_output_level(0), num_output_files(0), + num_output_files_blob(0), num_input_records(0), num_dropped_records(0), count(0) { @@ -197,11 +209,14 @@ class InternalStats { cpu_micros(0), bytes_read_non_output_levels(0), bytes_read_output_level(0), + bytes_read_blob(0), bytes_written(0), + bytes_written_blob(0), bytes_moved(0), num_input_files_in_non_output_levels(0), num_input_files_in_output_level(0), num_output_files(0), + num_output_files_blob(0), num_input_records(0), num_dropped_records(0), count(c) { @@ -222,12 +237,15 @@ class InternalStats { cpu_micros(c.cpu_micros), bytes_read_non_output_levels(c.bytes_read_non_output_levels), bytes_read_output_level(c.bytes_read_output_level), + bytes_read_blob(c.bytes_read_blob), bytes_written(c.bytes_written), + bytes_written_blob(c.bytes_written_blob), bytes_moved(c.bytes_moved), num_input_files_in_non_output_levels( c.num_input_files_in_non_output_levels), num_input_files_in_output_level(c.num_input_files_in_output_level), num_output_files(c.num_output_files), + num_output_files_blob(c.num_output_files_blob), num_input_records(c.num_input_records), num_dropped_records(c.num_dropped_records), count(c.count) { @@ -242,12 +260,15 @@ class InternalStats { cpu_micros = c.cpu_micros; bytes_read_non_output_levels = c.bytes_read_non_output_levels; bytes_read_output_level = c.bytes_read_output_level; + bytes_read_blob = c.bytes_read_blob; bytes_written = c.bytes_written; + bytes_written_blob = c.bytes_written_blob; bytes_moved = c.bytes_moved; num_input_files_in_non_output_levels = c.num_input_files_in_non_output_levels; num_input_files_in_output_level = c.num_input_files_in_output_level; num_output_files = c.num_output_files; + num_output_files_blob = c.num_output_files_blob; num_input_records = c.num_input_records; num_dropped_records = c.num_dropped_records; count = c.count; @@ -264,11 +285,14 @@ class InternalStats { this->cpu_micros = 0; this->bytes_read_non_output_levels = 0; this->bytes_read_output_level = 0; + this->bytes_read_blob = 0; this->bytes_written = 0; + this->bytes_written_blob = 0; this->bytes_moved = 0; this->num_input_files_in_non_output_levels = 0; this->num_input_files_in_output_level = 0; this->num_output_files = 0; + this->num_output_files_blob = 0; this->num_input_records = 0; this->num_dropped_records = 0; this->count = 0; @@ -283,13 +307,16 @@ class InternalStats { this->cpu_micros += c.cpu_micros; this->bytes_read_non_output_levels += c.bytes_read_non_output_levels; this->bytes_read_output_level += c.bytes_read_output_level; + this->bytes_read_blob += c.bytes_read_blob; this->bytes_written += c.bytes_written; + this->bytes_written_blob += c.bytes_written_blob; this->bytes_moved += c.bytes_moved; this->num_input_files_in_non_output_levels += 
c.num_input_files_in_non_output_levels; this->num_input_files_in_output_level += c.num_input_files_in_output_level; this->num_output_files += c.num_output_files; + this->num_output_files_blob += c.num_output_files_blob; this->num_input_records += c.num_input_records; this->num_dropped_records += c.num_dropped_records; this->count += c.count; @@ -304,13 +331,16 @@ class InternalStats { this->cpu_micros -= c.cpu_micros; this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels; this->bytes_read_output_level -= c.bytes_read_output_level; + this->bytes_read_blob -= c.bytes_read_blob; this->bytes_written -= c.bytes_written; + this->bytes_written_blob -= c.bytes_written_blob; this->bytes_moved -= c.bytes_moved; this->num_input_files_in_non_output_levels -= c.num_input_files_in_non_output_levels; this->num_input_files_in_output_level -= c.num_input_files_in_output_level; this->num_output_files -= c.num_output_files; + this->num_output_files_blob -= c.num_output_files_blob; this->num_input_records -= c.num_input_records; this->num_dropped_records -= c.num_dropped_records; this->count -= c.count; @@ -321,6 +351,39 @@ class InternalStats { } }; + // For use with CacheEntryStatsCollector + struct CacheEntryRoleStats { + uint64_t cache_capacity = 0; + std::string cache_id; + std::array total_charges; + std::array entry_counts; + uint32_t collection_count = 0; + uint32_t copies_of_last_collection = 0; + uint64_t last_start_time_micros_ = 0; + uint64_t last_end_time_micros_ = 0; + + void Clear() { + // Wipe everything except collection_count + uint32_t saved_collection_count = collection_count; + *this = CacheEntryRoleStats(); + collection_count = saved_collection_count; + } + + void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros); + std::function + GetEntryCallback(); + void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros); + void SkippedCollection(); + + std::string ToString(SystemClock* clock) const; + void ToMap(std::map* values, + SystemClock* clock) const; + + private: + std::unordered_map role_map_; + uint64_t GetLastDurationMicros() const; + }; + void Clear() { for (int i = 0; i < kIntStatsNumMax; i++) { db_stats_[i].store(0); @@ -329,6 +392,7 @@ class InternalStats { cf_stats_count_[i] = 0; cf_stats_value_[i] = 0; } + cache_entry_stats_.Clear(); for (auto& comp_stat : comp_stats_) { comp_stat.Clear(); } @@ -339,7 +403,7 @@ class InternalStats { cf_stats_snapshot_.Clear(); db_stats_snapshot_.Clear(); bg_error_count_ = 0; - started_at_ = env_->NowMicros(); + started_at_ = clock_->NowMicros(); } void AddCompactionStats(int level, Env::Priority thread_pri, @@ -401,6 +465,15 @@ class InternalStats { return comp_stats_; } + const CacheEntryRoleStats& TEST_GetCacheEntryRoleStats(bool foreground) { + Status s = CollectCacheEntryStats(foreground); + if (!s.ok()) { + assert(false); + cache_entry_stats_.Clear(); + } + return cache_entry_stats_; + } + // Store a mapping from the user-facing DB::Properties string to our // DBPropertyInfo struct used internally for retrieving properties. 
static const std::unordered_map ppt_name_to_info; @@ -409,6 +482,7 @@ class InternalStats { void DumpDBStats(std::string* value); void DumpCFMapStats(std::map* cf_stats); void DumpCFMapStats( + const VersionStorageInfo* vstorage, std::map>* level_stats, CompactionStats* compaction_stats_sum); void DumpCFMapStatsByPriority( @@ -420,11 +494,16 @@ class InternalStats { bool HandleBlockCacheStat(Cache** block_cache); + Status CollectCacheEntryStats(bool foreground); + // Per-DB stats std::atomic db_stats_[kIntStatsNumMax]; // Per-ColumnFamily stats uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX]; uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX]; + CacheEntryRoleStats cache_entry_stats_; + std::shared_ptr> + cache_entry_stats_collector_; // Per-ColumnFamily/level compaction stats std::vector comp_stats_; std::vector comp_stats_by_pri_; @@ -525,7 +604,8 @@ class InternalStats { bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix); bool HandleLevelStats(std::string* value, Slice suffix); bool HandleStats(std::string* value, Slice suffix); - bool HandleCFMapStats(std::map* compaction_stats); + bool HandleCFMapStats(std::map* compaction_stats, + Slice suffix); bool HandleCFStats(std::string* value, Slice suffix); bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix); bool HandleCFFileHistogram(std::string* value, Slice suffix); @@ -533,6 +613,10 @@ class InternalStats { bool HandleSsTables(std::string* value, Slice suffix); bool HandleAggregatedTableProperties(std::string* value, Slice suffix); bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix); + bool HandleAggregatedTablePropertiesMap( + std::map* values, Slice suffix); + bool HandleAggregatedTablePropertiesAtLevelMap( + std::map* values, Slice suffix); bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db, Version* version); bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db, @@ -587,6 +671,9 @@ class InternalStats { bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version); bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db, Version* version); + bool HandleBlockCacheEntryStats(std::string* value, Slice suffix); + bool HandleBlockCacheEntryStatsMap(std::map* values, + Slice suffix); // Total number of background errors encountered. Every time a flush task // or compaction task fails, this counter is incremented. 
The failure can // be caused by any possible reason, including file system errors, out of @@ -595,7 +682,7 @@ class InternalStats { uint64_t bg_error_count_; const int number_levels_; - Env* env_; + SystemClock* clock_; ColumnFamilyData* cfd_; uint64_t started_at_; }; @@ -634,18 +721,22 @@ class InternalStats { kIntStatsNumMax, }; - InternalStats(int /*num_levels*/, Env* /*env*/, ColumnFamilyData* /*cfd*/) {} + InternalStats(int /*num_levels*/, SystemClock* /*clock*/, + ColumnFamilyData* /*cfd*/) {} struct CompactionStats { uint64_t micros; uint64_t cpu_micros; uint64_t bytes_read_non_output_levels; uint64_t bytes_read_output_level; + uint64_t bytes_read_blob; uint64_t bytes_written; + uint64_t bytes_written_blob; uint64_t bytes_moved; int num_input_files_in_non_output_levels; int num_input_files_in_output_level; int num_output_files; + int num_output_files_blob; uint64_t num_input_records; uint64_t num_dropped_records; int count; @@ -673,6 +764,8 @@ class InternalStats { HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; } + HistogramImpl* GetBlobFileReadHist() { return nullptr; } + uint64_t GetBackgroundErrorCount() const { return 0; } uint64_t BumpAndGetBackgroundErrorCount() { return 0; } diff --git a/db/job_context.h b/db/job_context.h index d09937d11f0..cf48888e0db 100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -23,7 +23,7 @@ struct SuperVersion; struct SuperVersionContext { struct WriteStallNotification { WriteStallInfo write_stall_info; - const ImmutableCFOptions* immutable_cf_options; + const ImmutableOptions* immutable_options; }; autovector superversions_to_free; @@ -57,15 +57,16 @@ struct SuperVersionContext { #endif } - void PushWriteStallNotification( - WriteStallCondition old_cond, WriteStallCondition new_cond, - const std::string& name, const ImmutableCFOptions* ioptions) { + void PushWriteStallNotification(WriteStallCondition old_cond, + WriteStallCondition new_cond, + const std::string& name, + const ImmutableOptions* ioptions) { #if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) WriteStallNotification notif; notif.write_stall_info.cf_name = name; notif.write_stall_info.condition.prev = old_cond; notif.write_stall_info.condition.cur = new_cond; - notif.immutable_cf_options = ioptions; + notif.immutable_options = ioptions; write_stall_notifications.push_back(notif); #else (void)old_cond; @@ -79,7 +80,7 @@ struct SuperVersionContext { #if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) // notify listeners on changed write stall conditions for (auto& notif : write_stall_notifications) { - for (auto& listener : notif.immutable_cf_options->listeners) { + for (auto& listener : notif.immutable_options->listeners) { listener->OnStallConditionsChanged(notif.write_stall_info); } } diff --git a/db/kv_checksum.h b/db/kv_checksum.h new file mode 100644 index 00000000000..ba15dca3bb1 --- /dev/null +++ b/db/kv_checksum.h @@ -0,0 +1,424 @@ +// Copyright (c) 2020-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file contains classes containing fields to protect individual entries. +// The classes are named "ProtectionInfo", where indicates the +// combination of fields that are covered. Each field has a single letter +// abbreviation as follows. 
+// +// K = key +// V = value +// O = optype aka value type +// T = timestamp +// S = seqno +// C = CF ID +// +// Then, for example, a class that protects an entry consisting of key, value, +// optype, timestamp, and CF ID (i.e., a `WriteBatch` entry) would be named +// `ProtectionInfoKVOTC`. +// +// The `ProtectionInfo.*` classes are templated on the integer type used to hold +// the XOR of hashes for each field. Only unsigned integer types are supported, +// and the maximum supported integer width is 64 bits. When the integer type is +// narrower than the hash values, we lop off the most significant bits to make +// them fit. +// +// The `ProtectionInfo.*` classes are all intended to be non-persistent. We do +// not currently make the byte order consistent for integer fields before +// hashing them, so the resulting values are endianness-dependent. + +#pragma once + +#include + +#include "db/dbformat.h" +#include "rocksdb/types.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +template +class ProtectionInfo; +template +class ProtectionInfoKVOT; +template +class ProtectionInfoKVOTC; +template +class ProtectionInfoKVOTS; + +// Aliases for 64-bit protection infos. +typedef ProtectionInfo ProtectionInfo64; +typedef ProtectionInfoKVOT ProtectionInfoKVOT64; +typedef ProtectionInfoKVOTC ProtectionInfoKVOTC64; +typedef ProtectionInfoKVOTS ProtectionInfoKVOTS64; + +template +class ProtectionInfo { + public: + ProtectionInfo() = default; + + Status GetStatus() const; + ProtectionInfoKVOT ProtectKVOT(const Slice& key, const Slice& value, + ValueType op_type, + const Slice& timestamp) const; + ProtectionInfoKVOT ProtectKVOT(const SliceParts& key, + const SliceParts& value, ValueType op_type, + const Slice& timestamp) const; + + private: + friend class ProtectionInfoKVOT; + friend class ProtectionInfoKVOTS; + friend class ProtectionInfoKVOTC; + + // Each field is hashed with an independent value so we can catch fields being + // swapped. Per the `NPHash64()` docs, using consecutive seeds is a pitfall, + // and we should instead vary our seeds by a large odd number. This value by + // which we increment (0xD28AAD72F49BD50B) was taken from + // `head -c8 /dev/urandom | hexdump`, run repeatedly until it yielded an odd + // number. The values are computed manually since the Windows C++ compiler + // complains about the overflow when adding constants. 
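+  // For example, each seed below is the previous one plus that increment,
+  // reduced modulo 2^64:
+  //   kSeedO = kSeedV + 0xD28AAD72F49BD50B
+  //          = 0xD28AAD72F49BD50B + 0xD28AAD72F49BD50B
+  //          = 0x1A5155AE5E937AA16, truncated to 64 bits
+  //          = 0xA5155AE5E937AA16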
+ static const uint64_t kSeedK = 0; + static const uint64_t kSeedV = 0xD28AAD72F49BD50B; + static const uint64_t kSeedO = 0xA5155AE5E937AA16; + static const uint64_t kSeedT = 0x77A00858DDD37F21; + static const uint64_t kSeedS = 0x4A2AB5CBD26F542C; + static const uint64_t kSeedC = 0x1CB5633EC70B2937; + + ProtectionInfo(T val) : val_(val) { + static_assert(sizeof(ProtectionInfo) == sizeof(T), ""); + } + + T GetVal() const { return val_; } + void SetVal(T val) { val_ = val; } + + T val_ = 0; +}; + +template +class ProtectionInfoKVOT { + public: + ProtectionInfoKVOT() = default; + + ProtectionInfo StripKVOT(const Slice& key, const Slice& value, + ValueType op_type, const Slice& timestamp) const; + ProtectionInfo StripKVOT(const SliceParts& key, const SliceParts& value, + ValueType op_type, const Slice& timestamp) const; + + ProtectionInfoKVOTC ProtectC(ColumnFamilyId column_family_id) const; + ProtectionInfoKVOTS ProtectS(SequenceNumber sequence_number) const; + + void UpdateK(const Slice& old_key, const Slice& new_key); + void UpdateK(const SliceParts& old_key, const SliceParts& new_key); + void UpdateV(const Slice& old_value, const Slice& new_value); + void UpdateV(const SliceParts& old_value, const SliceParts& new_value); + void UpdateO(ValueType old_op_type, ValueType new_op_type); + void UpdateT(const Slice& old_timestamp, const Slice& new_timestamp); + + private: + friend class ProtectionInfo; + friend class ProtectionInfoKVOTS; + friend class ProtectionInfoKVOTC; + + ProtectionInfoKVOT(T val) : info_(val) { + static_assert(sizeof(ProtectionInfoKVOT) == sizeof(T), ""); + } + + T GetVal() const { return info_.GetVal(); } + void SetVal(T val) { info_.SetVal(val); } + + ProtectionInfo info_; +}; + +template +class ProtectionInfoKVOTC { + public: + ProtectionInfoKVOTC() = default; + + ProtectionInfoKVOT StripC(ColumnFamilyId column_family_id) const; + + void UpdateK(const Slice& old_key, const Slice& new_key) { + kvot_.UpdateK(old_key, new_key); + } + void UpdateK(const SliceParts& old_key, const SliceParts& new_key) { + kvot_.UpdateK(old_key, new_key); + } + void UpdateV(const Slice& old_value, const Slice& new_value) { + kvot_.UpdateV(old_value, new_value); + } + void UpdateV(const SliceParts& old_value, const SliceParts& new_value) { + kvot_.UpdateV(old_value, new_value); + } + void UpdateO(ValueType old_op_type, ValueType new_op_type) { + kvot_.UpdateO(old_op_type, new_op_type); + } + void UpdateT(const Slice& old_timestamp, const Slice& new_timestamp) { + kvot_.UpdateT(old_timestamp, new_timestamp); + } + void UpdateC(ColumnFamilyId old_column_family_id, + ColumnFamilyId new_column_family_id); + + private: + friend class ProtectionInfoKVOT; + + ProtectionInfoKVOTC(T val) : kvot_(val) { + static_assert(sizeof(ProtectionInfoKVOTC) == sizeof(T), ""); + } + + T GetVal() const { return kvot_.GetVal(); } + void SetVal(T val) { kvot_.SetVal(val); } + + ProtectionInfoKVOT kvot_; +}; + +template +class ProtectionInfoKVOTS { + public: + ProtectionInfoKVOTS() = default; + + ProtectionInfoKVOT StripS(SequenceNumber sequence_number) const; + + void UpdateK(const Slice& old_key, const Slice& new_key) { + kvot_.UpdateK(old_key, new_key); + } + void UpdateK(const SliceParts& old_key, const SliceParts& new_key) { + kvot_.UpdateK(old_key, new_key); + } + void UpdateV(const Slice& old_value, const Slice& new_value) { + kvot_.UpdateV(old_value, new_value); + } + void UpdateV(const SliceParts& old_value, const SliceParts& new_value) { + kvot_.UpdateV(old_value, new_value); + } + void UpdateO(ValueType 
old_op_type, ValueType new_op_type) { + kvot_.UpdateO(old_op_type, new_op_type); + } + void UpdateT(const Slice& old_timestamp, const Slice& new_timestamp) { + kvot_.UpdateT(old_timestamp, new_timestamp); + } + void UpdateS(SequenceNumber old_sequence_number, + SequenceNumber new_sequence_number); + + private: + friend class ProtectionInfoKVOT; + + ProtectionInfoKVOTS(T val) : kvot_(val) { + static_assert(sizeof(ProtectionInfoKVOTS) == sizeof(T), ""); + } + + T GetVal() const { return kvot_.GetVal(); } + void SetVal(T val) { kvot_.SetVal(val); } + + ProtectionInfoKVOT kvot_; +}; + +template +Status ProtectionInfo::GetStatus() const { + if (val_ != 0) { + return Status::Corruption("ProtectionInfo mismatch"); + } + return Status::OK(); +} + +template +ProtectionInfoKVOT ProtectionInfo::ProtectKVOT( + const Slice& key, const Slice& value, ValueType op_type, + const Slice& timestamp) const { + T val = GetVal(); + val = val ^ static_cast(GetSliceNPHash64(key, ProtectionInfo::kSeedK)); + val = + val ^ static_cast(GetSliceNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + val = val ^ + static_cast(GetSliceNPHash64(timestamp, ProtectionInfo::kSeedT)); + return ProtectionInfoKVOT(val); +} + +template +ProtectionInfoKVOT ProtectionInfo::ProtectKVOT( + const SliceParts& key, const SliceParts& value, ValueType op_type, + const Slice& timestamp) const { + T val = GetVal(); + val = val ^ + static_cast(GetSlicePartsNPHash64(key, ProtectionInfo::kSeedK)); + val = val ^ + static_cast(GetSlicePartsNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + val = val ^ + static_cast(GetSliceNPHash64(timestamp, ProtectionInfo::kSeedT)); + return ProtectionInfoKVOT(val); +} + +template +void ProtectionInfoKVOT::UpdateK(const Slice& old_key, + const Slice& new_key) { + T val = GetVal(); + val = val ^ + static_cast(GetSliceNPHash64(old_key, ProtectionInfo::kSeedK)); + val = val ^ + static_cast(GetSliceNPHash64(new_key, ProtectionInfo::kSeedK)); + SetVal(val); +} + +template +void ProtectionInfoKVOT::UpdateK(const SliceParts& old_key, + const SliceParts& new_key) { + T val = GetVal(); + val = val ^ static_cast( + GetSlicePartsNPHash64(old_key, ProtectionInfo::kSeedK)); + val = val ^ static_cast( + GetSlicePartsNPHash64(new_key, ProtectionInfo::kSeedK)); + SetVal(val); +} + +template +void ProtectionInfoKVOT::UpdateV(const Slice& old_value, + const Slice& new_value) { + T val = GetVal(); + val = val ^ + static_cast(GetSliceNPHash64(old_value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(GetSliceNPHash64(new_value, ProtectionInfo::kSeedV)); + SetVal(val); +} + +template +void ProtectionInfoKVOT::UpdateV(const SliceParts& old_value, + const SliceParts& new_value) { + T val = GetVal(); + val = val ^ static_cast( + GetSlicePartsNPHash64(old_value, ProtectionInfo::kSeedV)); + val = val ^ static_cast( + GetSlicePartsNPHash64(new_value, ProtectionInfo::kSeedV)); + SetVal(val); +} + +template +void ProtectionInfoKVOT::UpdateO(ValueType old_op_type, + ValueType new_op_type) { + T val = GetVal(); + val = val ^ static_cast(NPHash64(reinterpret_cast(&old_op_type), + sizeof(old_op_type), + ProtectionInfo::kSeedO)); + val = val ^ static_cast(NPHash64(reinterpret_cast(&new_op_type), + sizeof(new_op_type), + ProtectionInfo::kSeedO)); + SetVal(val); +} + +template +void ProtectionInfoKVOT::UpdateT(const Slice& old_timestamp, + 
const Slice& new_timestamp) { + T val = GetVal(); + val = val ^ static_cast( + GetSliceNPHash64(old_timestamp, ProtectionInfo::kSeedT)); + val = val ^ static_cast( + GetSliceNPHash64(new_timestamp, ProtectionInfo::kSeedT)); + SetVal(val); +} + +template +ProtectionInfo ProtectionInfoKVOT::StripKVOT( + const Slice& key, const Slice& value, ValueType op_type, + const Slice& timestamp) const { + T val = GetVal(); + val = val ^ static_cast(GetSliceNPHash64(key, ProtectionInfo::kSeedK)); + val = + val ^ static_cast(GetSliceNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + val = val ^ + static_cast(GetSliceNPHash64(timestamp, ProtectionInfo::kSeedT)); + return ProtectionInfo(val); +} + +template +ProtectionInfo ProtectionInfoKVOT::StripKVOT( + const SliceParts& key, const SliceParts& value, ValueType op_type, + const Slice& timestamp) const { + T val = GetVal(); + val = val ^ + static_cast(GetSlicePartsNPHash64(key, ProtectionInfo::kSeedK)); + val = val ^ + static_cast(GetSlicePartsNPHash64(value, ProtectionInfo::kSeedV)); + val = val ^ + static_cast(NPHash64(reinterpret_cast(&op_type), + sizeof(op_type), ProtectionInfo::kSeedO)); + val = val ^ + static_cast(GetSliceNPHash64(timestamp, ProtectionInfo::kSeedT)); + return ProtectionInfo(val); +} + +template +ProtectionInfoKVOTC ProtectionInfoKVOT::ProtectC( + ColumnFamilyId column_family_id) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&column_family_id), + sizeof(column_family_id), ProtectionInfo::kSeedC)); + return ProtectionInfoKVOTC(val); +} + +template +ProtectionInfoKVOT ProtectionInfoKVOTC::StripC( + ColumnFamilyId column_family_id) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&column_family_id), + sizeof(column_family_id), ProtectionInfo::kSeedC)); + return ProtectionInfoKVOT(val); +} + +template +void ProtectionInfoKVOTC::UpdateC(ColumnFamilyId old_column_family_id, + ColumnFamilyId new_column_family_id) { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&old_column_family_id), + sizeof(old_column_family_id), ProtectionInfo::kSeedC)); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&new_column_family_id), + sizeof(new_column_family_id), ProtectionInfo::kSeedC)); + SetVal(val); +} + +template +ProtectionInfoKVOTS ProtectionInfoKVOT::ProtectS( + SequenceNumber sequence_number) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64(reinterpret_cast(&sequence_number), + sizeof(sequence_number), + ProtectionInfo::kSeedS)); + return ProtectionInfoKVOTS(val); +} + +template +ProtectionInfoKVOT ProtectionInfoKVOTS::StripS( + SequenceNumber sequence_number) const { + T val = GetVal(); + val = val ^ static_cast(NPHash64(reinterpret_cast(&sequence_number), + sizeof(sequence_number), + ProtectionInfo::kSeedS)); + return ProtectionInfoKVOT(val); +} + +template +void ProtectionInfoKVOTS::UpdateS(SequenceNumber old_sequence_number, + SequenceNumber new_sequence_number) { + T val = GetVal(); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&old_sequence_number), + sizeof(old_sequence_number), ProtectionInfo::kSeedS)); + val = val ^ static_cast(NPHash64( + reinterpret_cast(&new_sequence_number), + sizeof(new_sequence_number), ProtectionInfo::kSeedS)); + SetVal(val); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/listener_test.cc b/db/listener_test.cc index 5f0511d78f6..7c6eb9fe05e 100644 --- a/db/listener_test.cc +++ 
b/db/listener_test.cc @@ -10,7 +10,6 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" -#include "logging/logging.h" #include "memtable/hash_linklist_rep.h" #include "monitoring/statistics.h" #include "rocksdb/cache.h" @@ -193,10 +192,10 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) { ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i], nullptr, nullptr)); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size()); @@ -212,6 +211,10 @@ class TestFlushListener : public EventListener { : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) { db_closed = false; } + + virtual ~TestFlushListener() { + prev_fc_info_.status.PermitUncheckedError(); // Ignore the status + } void OnTableFileCreated( const TableFileCreationInfo& info) override { // remember the info for later checking the FlushJobInfo. @@ -334,7 +337,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) { ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); for (int i = 1; i < 8; ++i) { ASSERT_OK(Flush(i)); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(listener->flushed_dbs_.size(), i); ASSERT_EQ(listener->flushed_column_family_names_.size(), i); } @@ -418,7 +421,7 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { ASSERT_OK(DB::Open(options, dbname_ + ToString(d), &db)); for (size_t c = 0; c < cf_names.size(); ++c) { ColumnFamilyHandle* handle; - db->CreateColumnFamily(cf_opts, cf_names[c], &handle); + ASSERT_OK(db->CreateColumnFamily(cf_opts, cf_names[c], &handle)); handles.push_back(handle); } @@ -436,7 +439,8 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) { for (size_t c = 0; c < cf_names.size(); ++c) { for (int d = 0; d < kNumDBs; ++d) { ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c])); - static_cast_with_check(dbs[d])->TEST_WaitForFlushMemTable(); + ASSERT_OK( + static_cast_with_check(dbs[d])->TEST_WaitForFlushMemTable()); } } @@ -495,10 +499,10 @@ TEST_F(EventListenerTest, DisableBGCompaction) { // keep writing until writes are forced to stop. 
for (int i = 0; static_cast(cf_meta.file_count) < kSlowdownTrigger * 10; ++i) { - Put(1, ToString(i), std::string(10000, 'x'), WriteOptions()); + ASSERT_OK(Put(1, ToString(i), std::string(10000, 'x'), WriteOptions())); FlushOptions fo; fo.allow_write_stall = true; - db_->Flush(fo, handles_[1]); + ASSERT_OK(db_->Flush(fo, handles_[1])); db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); } ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9); @@ -535,7 +539,7 @@ TEST_F(EventListenerTest, CompactionReasonLevel) { for (int i = 0; i < 4; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(listener->compaction_reasons_.size(), 1); ASSERT_EQ(listener->compaction_reasons_[0], @@ -552,14 +556,14 @@ TEST_F(EventListenerTest, CompactionReasonLevel) { } // Do a trivial move from L0 -> L1 - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); options.max_bytes_for_level_base = 1; Close(); listener->compaction_reasons_.clear(); Reopen(options); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 1); for (auto compaction_reason : listener->compaction_reasons_) { @@ -571,7 +575,7 @@ TEST_F(EventListenerTest, CompactionReasonLevel) { listener->compaction_reasons_.clear(); Reopen(options); - Put("key", "value"); + ASSERT_OK(Put("key", "value")); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); @@ -605,7 +609,7 @@ TEST_F(EventListenerTest, CompactionReasonUniversal) { for (int i = 0; i < 8; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -623,7 +627,7 @@ TEST_F(EventListenerTest, CompactionReasonUniversal) { for (int i = 0; i < 8; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -635,7 +639,7 @@ TEST_F(EventListenerTest, CompactionReasonUniversal) { listener->compaction_reasons_.clear(); Reopen(options); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -664,7 +668,7 @@ TEST_F(EventListenerTest, CompactionReasonFIFO) { for (int i = 0; i < 4; i++) { GenerateNewRandomFile(&rnd); } - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->compaction_reasons_.size(), 0); for (auto compaction_reason : listener->compaction_reasons_) { @@ -676,7 +680,7 @@ class TableFileCreationListener : public EventListener { public: class TestEnv : public EnvWrapper { public: - TestEnv() : EnvWrapper(Env::Default()) {} + explicit TestEnv(Env* t) : EnvWrapper(t) {} void SetStatus(Status s) { status_ = s; } @@ -688,7 +692,7 @@ class TableFileCreationListener : public EventListener { return status_; } } - return Env::Default()->NewWritableFile(fname, result, options); + return target()->NewWritableFile(fname, result, options); } private: @@ -766,7 +770,6 @@ class 
TableFileCreationListener : public EventListener { } } - TestEnv test_env; int started_[2]; int finished_[2]; int failure_[2]; @@ -775,44 +778,48 @@ class TableFileCreationListener : public EventListener { TEST_F(EventListenerTest, TableFileCreationListenersTest) { auto listener = std::make_shared(); Options options; + std::unique_ptr test_env( + new TableFileCreationListener::TestEnv(CurrentOptions().env)); options.create_if_missing = true; options.listeners.push_back(listener); - options.env = &listener->test_env; + options.env = test_env.get(); DestroyAndReopen(options); ASSERT_OK(Put("foo", "aaa")); ASSERT_OK(Put("bar", "bbb")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0); - ASSERT_OK(Put("foo", "aaa1")); ASSERT_OK(Put("bar", "bbb1")); - listener->test_env.SetStatus(Status::NotSupported("not supported")); + test_env->SetStatus(Status::NotSupported("not supported")); ASSERT_NOK(Flush()); listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0); - listener->test_env.SetStatus(Status::OK()); + test_env->SetStatus(Status::OK()); Reopen(options); ASSERT_OK(Put("foo", "aaa2")); ASSERT_OK(Put("bar", "bbb2")); ASSERT_OK(Flush()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0); const Slice kRangeStart = "a"; const Slice kRangeEnd = "z"; - dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK( + dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); listener->CheckAndResetCounters(0, 0, 0, 1, 1, 0); ASSERT_OK(Put("foo", "aaa3")); ASSERT_OK(Put("bar", "bbb3")); ASSERT_OK(Flush()); - listener->test_env.SetStatus(Status::NotSupported("not supported")); - dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd); - dbfull()->TEST_WaitForCompact(); + test_env->SetStatus(Status::NotSupported("not supported")); + ASSERT_NOK( + dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd)); + ASSERT_NOK(dbfull()->TEST_WaitForCompact()); listener->CheckAndResetCounters(1, 1, 0, 1, 1, 1); + Close(); } class MemTableSealedListener : public EventListener { @@ -833,6 +840,7 @@ class MemTableSealedListener : public EventListener { TEST_F(EventListenerTest, MemTableSealedListenerTest) { auto listener = std::make_shared(); Options options; + options.env = CurrentOptions().env; options.create_if_missing = true; options.listeners.push_back(listener); DestroyAndReopen(options); @@ -1066,7 +1074,7 @@ TEST_F(EventListenerTest, OnFileOperationTest) { TestFileOperationListener* listener = new TestFileOperationListener(); options.listeners.emplace_back(listener); - options.use_direct_io_for_flush_and_compaction = true; + options.use_direct_io_for_flush_and_compaction = false; Status s = TryReopen(options); if (s.IsInvalidArgument()) { options.use_direct_io_for_flush_and_compaction = false; @@ -1075,8 +1083,8 @@ TEST_F(EventListenerTest, OnFileOperationTest) { } DestroyAndReopen(options); ASSERT_OK(Put("foo", "aaa")); - dbfull()->Flush(FlushOptions()); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_GE(listener->file_writes_.load(), listener->file_writes_success_.load()); ASSERT_GT(listener->file_writes_.load(), 0); diff --git a/db/log_test.cc b/db/log_test.cc index 
2697619683f..2e993d8f905 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -9,7 +9,6 @@ #include "db/log_reader.h" #include "db/log_writer.h" -#include "env/composite_env_wrapper.h" #include "file/sequence_file_reader.h" #include "file/writable_file_writer.h" #include "rocksdb/env.h" @@ -50,7 +49,7 @@ static std::string RandomSkewedString(int i, Random* rnd) { // get<1>(tuple): true if allow retry after read EOF, false otherwise class LogTest : public ::testing::TestWithParam> { private: - class StringSource : public SequentialFile { + class StringSource : public FSSequentialFile { public: Slice& contents_; bool force_error_; @@ -68,7 +67,8 @@ class LogTest : public ::testing::TestWithParam> { returned_partial_(false), fail_after_read_partial_(fail_after_read_partial) {} - Status Read(size_t n, Slice* result, char* scratch) override { + IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { if (fail_after_read_partial_) { EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error"; } @@ -81,7 +81,7 @@ class LogTest : public ::testing::TestWithParam> { contents_.remove_prefix(force_error_position_); force_error_ = false; returned_partial_ = true; - return Status::Corruption("read error"); + return IOStatus::Corruption("read error"); } } @@ -106,28 +106,21 @@ class LogTest : public ::testing::TestWithParam> { *result = Slice(scratch, n); contents_.remove_prefix(n); - return Status::OK(); + return IOStatus::OK(); } - Status Skip(uint64_t n) override { + IOStatus Skip(uint64_t n) override { if (n > contents_.size()) { contents_.clear(); - return Status::NotFound("in-memory file skipepd past end"); + return IOStatus::NotFound("in-memory file skipepd past end"); } contents_.remove_prefix(n); - return Status::OK(); + return IOStatus::OK(); } }; - inline StringSource* GetStringSourceFromLegacyReader( - SequentialFileReader* reader) { - LegacySequentialFileWrapper* file = - static_cast(reader->file()); - return static_cast(file->target()); - } - class ReportCollector : public Reader::Reporter { public: size_t dropped_bytes_; @@ -140,29 +133,17 @@ class LogTest : public ::testing::TestWithParam> { } }; - std::string& dest_contents() { - auto dest = test::GetStringSinkFromLegacyWriter(writer_.file()); - assert(dest); - return dest->contents_; - } + std::string& dest_contents() { return sink_->contents_; } - const std::string& dest_contents() const { - auto dest = test::GetStringSinkFromLegacyWriter(writer_.file()); - assert(dest); - return dest->contents_; - } + const std::string& dest_contents() const { return sink_->contents_; } - void reset_source_contents() { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - assert(src); - src->contents_ = dest_contents(); - } + void reset_source_contents() { source_->contents_ = dest_contents(); } Slice reader_contents_; - std::unique_ptr dest_holder_; - std::unique_ptr source_holder_; + test::StringSink* sink_; + StringSource* source_; ReportCollector report_; - Writer writer_; + std::unique_ptr writer_; std::unique_ptr reader_; protected: @@ -171,19 +152,23 @@ class LogTest : public ::testing::TestWithParam> { public: LogTest() : reader_contents_(), - dest_holder_(test::GetWritableFileWriter( - new test::StringSink(&reader_contents_), "" /* don't care */)), - source_holder_(test::GetSequentialFileReader( - new StringSource(reader_contents_, !std::get<1>(GetParam())), - "" /* file name */)), - writer_(std::move(dest_holder_), 123, std::get<0>(GetParam())), + sink_(new 
test::StringSink(&reader_contents_)), + source_(new StringSource(reader_contents_, !std::get<1>(GetParam()))), allow_retry_read_(std::get<1>(GetParam())) { + std::unique_ptr sink_holder(sink_); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(sink_holder), "" /* don't care */, FileOptions())); + writer_.reset( + new Writer(std::move(file_writer), 123, std::get<0>(GetParam()))); + std::unique_ptr source_holder(source_); + std::unique_ptr file_reader( + new SequentialFileReader(std::move(source_holder), "" /* file name */)); if (allow_retry_read_) { - reader_.reset(new FragmentBufferedReader( - nullptr, std::move(source_holder_), &report_, true /* checksum */, - 123 /* log_number */)); + reader_.reset(new FragmentBufferedReader(nullptr, std::move(file_reader), + &report_, true /* checksum */, + 123 /* log_number */)); } else { - reader_.reset(new Reader(nullptr, std::move(source_holder_), &report_, + reader_.reset(new Reader(nullptr, std::move(file_reader), &report_, true /* checksum */, 123 /* log_number */)); } } @@ -191,7 +176,7 @@ class LogTest : public ::testing::TestWithParam> { Slice* get_reader_contents() { return &reader_contents_; } void Write(const std::string& msg) { - writer_.AddRecord(Slice(msg)); + ASSERT_OK(writer_->AddRecord(Slice(msg))); } size_t WrittenBytes() const { @@ -219,11 +204,7 @@ class LogTest : public ::testing::TestWithParam> { dest_contents()[offset] = new_byte; } - void ShrinkSize(int bytes) { - auto dest = test::GetStringSinkFromLegacyWriter(writer_.file()); - assert(dest); - dest->Drop(bytes); - } + void ShrinkSize(int bytes) { sink_->Drop(bytes); } void FixChecksum(int header_offset, int len, bool recyclable) { // Compute crc of type/len/data @@ -235,9 +216,8 @@ class LogTest : public ::testing::TestWithParam> { } void ForceError(size_t position = 0) { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - src->force_error_ = true; - src->force_error_position_ = position; + source_->force_error_ = true; + source_->force_error_position_ = position; } size_t DroppedBytes() const { @@ -249,14 +229,12 @@ class LogTest : public ::testing::TestWithParam> { } void ForceEOF(size_t position = 0) { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - src->force_eof_ = true; - src->force_eof_position_ = position; + source_->force_eof_ = true; + source_->force_eof_position_ = position; } void UnmarkEOF() { - auto src = GetStringSourceFromLegacyReader(reader_->file()); - src->returned_partial_ = false; + source_->returned_partial_ = false; reader_->UnmarkEOF(); } @@ -685,12 +663,13 @@ TEST_P(LogTest, Recycle) { while (get_reader_contents()->size() < log::kBlockSize * 2) { Write("xxxxxxxxxxxxxxxx"); } - std::unique_ptr dest_holder(test::GetWritableFileWriter( - new test::OverwritingStringSink(get_reader_contents()), - "" /* don't care */)); + std::unique_ptr sink( + new test::OverwritingStringSink(get_reader_contents())); + std::unique_ptr dest_holder(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); Writer recycle_writer(std::move(dest_holder), 123, true); - recycle_writer.AddRecord(Slice("foooo")); - recycle_writer.AddRecord(Slice("bar")); + ASSERT_OK(recycle_writer.AddRecord(Slice("foooo"))); + ASSERT_OK(recycle_writer.AddRecord(Slice("bar"))); ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2); ASSERT_EQ("foooo", Read()); ASSERT_EQ("bar", Read()); @@ -718,10 +697,9 @@ class RetriableLogTest : public ::testing::TestWithParam { }; Slice contents_; - std::unique_ptr dest_holder_; + 
test::StringSink* sink_; std::unique_ptr log_writer_; Env* env_; - EnvOptions env_options_; const std::string test_dir_; const std::string log_file_; std::unique_ptr writer_; @@ -732,61 +710,58 @@ class RetriableLogTest : public ::testing::TestWithParam { public: RetriableLogTest() : contents_(), - dest_holder_(nullptr), + sink_(new test::StringSink(&contents_)), log_writer_(nullptr), env_(Env::Default()), test_dir_(test::PerThreadDBPath("retriable_log_test")), log_file_(test_dir_ + "/log"), writer_(nullptr), reader_(nullptr), - log_reader_(nullptr) {} + log_reader_(nullptr) { + std::unique_ptr sink_holder(sink_); + std::unique_ptr wfw(new WritableFileWriter( + std::move(sink_holder), "" /* file name */, FileOptions())); + log_writer_.reset(new Writer(std::move(wfw), 123, GetParam())); + } Status SetupTestEnv() { - dest_holder_.reset(test::GetWritableFileWriter( - new test::StringSink(&contents_), "" /* file name */)); - assert(dest_holder_ != nullptr); - log_writer_.reset(new Writer(std::move(dest_holder_), 123, GetParam())); - assert(log_writer_ != nullptr); - Status s; - s = env_->CreateDirIfMissing(test_dir_); - std::unique_ptr writable_file; + FileOptions fopts; + auto fs = env_->GetFileSystem(); + s = fs->CreateDirIfMissing(test_dir_, IOOptions(), nullptr); + std::unique_ptr writable_file; if (s.ok()) { - s = env_->NewWritableFile(log_file_, &writable_file, env_options_); + s = fs->NewWritableFile(log_file_, fopts, &writable_file, nullptr); } if (s.ok()) { - writer_.reset(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), log_file_, - env_options_)); - assert(writer_ != nullptr); + writer_.reset( + new WritableFileWriter(std::move(writable_file), log_file_, fopts)); + EXPECT_NE(writer_, nullptr); } - std::unique_ptr seq_file; + std::unique_ptr seq_file; if (s.ok()) { - s = env_->NewSequentialFile(log_file_, &seq_file, env_options_); + s = fs->NewSequentialFile(log_file_, fopts, &seq_file, nullptr); } if (s.ok()) { - reader_.reset(new SequentialFileReader( - NewLegacySequentialFileWrapper(seq_file), log_file_)); - assert(reader_ != nullptr); + reader_.reset(new SequentialFileReader(std::move(seq_file), log_file_)); + EXPECT_NE(reader_, nullptr); log_reader_.reset(new FragmentBufferedReader( nullptr, std::move(reader_), &report_, true /* checksum */, 123 /* log_number */)); - assert(log_reader_ != nullptr); + EXPECT_NE(log_reader_, nullptr); } return s; } - std::string contents() { - auto file = test::GetStringSinkFromLegacyWriter(log_writer_->file()); - assert(file != nullptr); - return file->contents_; - } + std::string contents() { return sink_->contents_; } - void Encode(const std::string& msg) { log_writer_->AddRecord(Slice(msg)); } + void Encode(const std::string& msg) { + ASSERT_OK(log_writer_->AddRecord(Slice(msg))); + } void Write(const Slice& data) { - writer_->Append(data); - writer_->Sync(true); + ASSERT_OK(writer_->Append(data)); + ASSERT_OK(writer_->Sync(true)); } bool TryRead(std::string* result) { diff --git a/db/log_writer.cc b/db/log_writer.cc index e290eae6258..6a82f31e10e 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -33,7 +33,7 @@ Writer::Writer(std::unique_ptr&& dest, uint64_t log_number, Writer::~Writer() { if (dest_) { - WriteBuffer(); + WriteBuffer().PermitUncheckedError(); } } diff --git a/db/log_writer.h b/db/log_writer.h index 463826e88ee..1a91b21994d 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -8,8 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
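// An illustrative note on the log_writer.cc destructor change above (a
// sketch, not part of this patch; `writer` is a hypothetical log::Writer):
// under RocksDB's assert-status-checked builds, destroying a Status that was
// never examined trips an assertion, so a result that is deliberately
// ignored has to be released explicitly.
Status s = writer.WriteBuffer();
s.PermitUncheckedError();  // acknowledge the status is intentionally dropped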
#pragma once -#include - +#include #include #include "db/log_format.h" diff --git a/db/logs_with_prep_tracker.h b/db/logs_with_prep_tracker.h index 86c88012adc..7f9ece76bca 100644 --- a/db/logs_with_prep_tracker.h +++ b/db/logs_with_prep_tracker.h @@ -5,8 +5,8 @@ // #pragma once -#include #include +#include #include #include #include diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index c8039b5397a..9005e932a65 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -100,13 +100,13 @@ TEST_F(ManualCompactionTest, CompactTouchesAllKeys) { options.compaction_filter = new DestroyAllCompactionFilter(); ASSERT_OK(DB::Open(options, dbname_, &db)); - db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); - db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); - db->Put(WriteOptions(), Slice("key3"), Slice("value3")); - db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice("destroy"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key2"), Slice("destroy"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3"))); + ASSERT_OK(db->Put(WriteOptions(), Slice("key4"), Slice("destroy"))); Slice key4("key4"); - db->CompactRange(CompactRangeOptions(), nullptr, &key4); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &key4)); Iterator* itr = db->NewIterator(ReadOptions()); itr->SeekToFirst(); ASSERT_TRUE(itr->Valid()); @@ -135,21 +135,21 @@ TEST_F(ManualCompactionTest, Test) { // create first key range WriteBatch batch; for (int i = 0; i < kNumKeys; i++) { - batch.Put(Key1(i), "value for range 1 key"); + ASSERT_OK(batch.Put(Key1(i), "value for range 1 key")); } ASSERT_OK(db->Write(WriteOptions(), &batch)); // create second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - batch.Put(Key2(i), "value for range 2 key"); + ASSERT_OK(batch.Put(Key2(i), "value for range 2 key")); } ASSERT_OK(db->Write(WriteOptions(), &batch)); // delete second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - batch.Delete(Key2(i)); + ASSERT_OK(batch.Delete(Key2(i))); } ASSERT_OK(db->Write(WriteOptions(), &batch)); @@ -160,7 +160,7 @@ TEST_F(ManualCompactionTest, Test) { Slice greatest(end_key.data(), end_key.size()); // commenting out the line below causes the example to work correctly - db->CompactRange(CompactRangeOptions(), &least, &greatest); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &least, &greatest)); // count the keys Iterator* iter = db->NewIterator(ReadOptions()); @@ -205,7 +205,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { Slice start("5"); Slice end("7"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); ASSERT_EQ(0, filter->NumKeys()); } @@ -215,7 +215,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { Slice start("3"); Slice end("7"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); ASSERT_EQ(2, filter->NumKeys()); ASSERT_EQ(0, filter->KeyLevel("4")); ASSERT_EQ(0, filter->KeyLevel("8")); @@ -227,7 +227,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { // no file has keys in range (-inf, 0] Slice end("0"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), nullptr, &end); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &end)); ASSERT_EQ(0, filter->NumKeys()); } @@ -237,7 +237,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { // no file has keys in 
range [9, inf) Slice start("9"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), &start, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); ASSERT_EQ(0, filter->NumKeys()); } @@ -248,7 +248,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { Slice start("2"); Slice end("2"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); ASSERT_EQ(1, filter->NumKeys()); ASSERT_EQ(0, filter->KeyLevel("2")); } @@ -260,7 +260,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { Slice start("2"); Slice end("5"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end)); ASSERT_EQ(3, filter->NumKeys()); ASSERT_EQ(1, filter->KeyLevel("2")); ASSERT_EQ(1, filter->KeyLevel("4")); @@ -273,7 +273,7 @@ TEST_F(ManualCompactionTest, SkipLevel) { // [0, inf) overlaps all files Slice start("0"); filter->Reset(); - db->CompactRange(CompactRangeOptions(), &start, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); ASSERT_EQ(4, filter->NumKeys()); // 1 is first compacted to L1 and then further compacted into [2, 4, 8], // so finally the logged level for 1 is L1. diff --git a/db/memtable.cc b/db/memtable.cc index 53be973e795..2b2598658b1 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -13,7 +13,9 @@ #include #include #include + #include "db/dbformat.h" +#include "db/kv_checksum.h" #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" @@ -41,7 +43,7 @@ namespace ROCKSDB_NAMESPACE { ImmutableMemTableOptions::ImmutableMemTableOptions( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options) : arena_block_size(mutable_cf_options.arena_block_size), memtable_prefix_bloom_bits( @@ -56,13 +58,13 @@ ImmutableMemTableOptions::ImmutableMemTableOptions( inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), inplace_callback(ioptions.inplace_callback), max_successive_merges(mutable_cf_options.max_successive_merges), - statistics(ioptions.statistics), - merge_operator(ioptions.merge_operator), - info_log(ioptions.info_log), + statistics(ioptions.stats), + merge_operator(ioptions.merge_operator.get()), + info_log(ioptions.logger), allow_data_in_errors(ioptions.allow_data_in_errors) {} MemTable::MemTable(const InternalKeyComparator& cmp, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, WriteBufferManager* write_buffer_manager, SequenceNumber latest_seq, uint32_t column_family_id) @@ -80,9 +82,9 @@ MemTable::MemTable(const InternalKeyComparator& cmp, mutable_cf_options.memtable_huge_page_size), table_(ioptions.memtable_factory->CreateMemTableRep( comparator_, &arena_, mutable_cf_options.prefix_extractor.get(), - ioptions.info_log, column_family_id)), + ioptions.logger, column_family_id)), range_del_table_(SkipListFactory().CreateMemTableRep( - comparator_, &arena_, nullptr /* transform */, ioptions.info_log, + comparator_, &arena_, nullptr /* transform */, ioptions.logger, column_family_id)), is_range_del_table_empty_(true), data_size_(0), @@ -102,9 +104,9 @@ MemTable::MemTable(const InternalKeyComparator& cmp, : 0), prefix_extractor_(mutable_cf_options.prefix_extractor.get()), flush_state_(FLUSH_NOT_REQUESTED), - env_(ioptions.env), + clock_(ioptions.clock), insert_with_hint_prefix_extractor_( - 
ioptions.memtable_insert_with_hint_prefix_extractor), + ioptions.memtable_insert_with_hint_prefix_extractor.get()), oldest_key_time_(std::numeric_limits::max()), atomic_flush_seqno_(kMaxSequenceNumber), approximate_memory_usage_(0) { @@ -118,7 +120,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, bloom_filter_.reset( new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits, 6 /* hard coded 6 probes */, - moptions_.memtable_huge_page_size, ioptions.info_log)); + moptions_.memtable_huge_page_size, ioptions.logger)); } } @@ -221,7 +223,7 @@ void MemTable::UpdateOldestKeyTime() { uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed); if (oldest_key_time == std::numeric_limits::max()) { int64_t current_time = 0; - auto s = env_->GetCurrentTime(¤t_time); + auto s = clock_->GetCurrentTime(¤t_time); if (s.ok()) { assert(current_time >= 0); // If fail, the timestamp is already set. @@ -328,9 +330,11 @@ class MemTableIterator : public InternalIterator { PERF_COUNTER_ADD(seek_on_memtable_count, 1); if (bloom_) { // iterator should only use prefix bloom filter - Slice user_k(ExtractUserKey(k)); - if (prefix_extractor_->InDomain(user_k) && - !bloom_->MayContain(prefix_extractor_->Transform(user_k))) { + auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + if (prefix_extractor_->InDomain(user_k_without_ts) && + !bloom_->MayContain( + prefix_extractor_->Transform(user_k_without_ts))) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); valid_ = false; return; @@ -345,9 +349,11 @@ class MemTableIterator : public InternalIterator { PERF_TIMER_GUARD(seek_on_memtable_time); PERF_COUNTER_ADD(seek_on_memtable_count, 1); if (bloom_) { - Slice user_k(ExtractUserKey(k)); - if (prefix_extractor_->InDomain(user_k) && - !bloom_->MayContain(prefix_extractor_->Transform(user_k))) { + auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + if (prefix_extractor_->InDomain(user_k_without_ts) && + !bloom_->MayContain( + prefix_extractor_->Transform(user_k_without_ts))) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); valid_ = false; return; @@ -480,10 +486,55 @@ MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, return {entry_count * (data_size / n), entry_count}; } -bool MemTable::Add(SequenceNumber s, ValueType type, - const Slice& key, /* user key */ - const Slice& value, bool allow_concurrent, - MemTablePostProcessInfo* post_process_info, void** hint) { +Status MemTable::VerifyEncodedEntry(Slice encoded, + const ProtectionInfoKVOTS64& kv_prot_info) { + uint32_t ikey_len = 0; + if (!GetVarint32(&encoded, &ikey_len)) { + return Status::Corruption("Unable to parse internal key length"); + } + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + if (ikey_len < 8 + ts_sz) { + return Status::Corruption("Internal key length too short"); + } + if (ikey_len > encoded.size()) { + return Status::Corruption("Internal key length too long"); + } + uint32_t value_len = 0; + const size_t key_without_ts_len = ikey_len - ts_sz - 8; + Slice key(encoded.data(), key_without_ts_len); + encoded.remove_prefix(key_without_ts_len); + + Slice timestamp(encoded.data(), ts_sz); + encoded.remove_prefix(ts_sz); + + uint64_t packed = DecodeFixed64(encoded.data()); + ValueType value_type = kMaxValue; + SequenceNumber sequence_number = kMaxSequenceNumber; + UnPackSequenceAndType(packed, 
&sequence_number, &value_type); + encoded.remove_prefix(8); + + if (!GetVarint32(&encoded, &value_len)) { + return Status::Corruption("Unable to parse value length"); + } + if (value_len < encoded.size()) { + return Status::Corruption("Value length too short"); + } + if (value_len > encoded.size()) { + return Status::Corruption("Value length too long"); + } + Slice value(encoded.data(), value_len); + + return kv_prot_info.StripS(sequence_number) + .StripKVOT(key, value, value_type, timestamp) + .GetStatus(); +} + +Status MemTable::Add(SequenceNumber s, ValueType type, + const Slice& key, /* user key */ + const Slice& value, + const ProtectionInfoKVOTS64* kv_prot_info, + bool allow_concurrent, + MemTablePostProcessInfo* post_process_info, void** hint) { // Format of an entry is concatenation of: // key_size : varint32 of internal_key.size() // key bytes : char[internal_key.size()] @@ -510,7 +561,17 @@ bool MemTable::Add(SequenceNumber s, ValueType type, p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len); + if (kv_prot_info != nullptr) { + Slice encoded(buf, encoded_len); + TEST_SYNC_POINT_CALLBACK("MemTable::Add:Encoded", &encoded); + Status status = VerifyEncodedEntry(encoded, *kv_prot_info); + if (!status.ok()) { + return status; + } + } + size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz); if (!allow_concurrent) { // Extract prefix for insert with hint. @@ -519,12 +580,12 @@ bool MemTable::Add(SequenceNumber s, ValueType type, Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice); bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]); if (UNLIKELY(!res)) { - return res; + return Status::TryAgain("key+seq exists"); } } else { bool res = table->InsertKey(handle); if (UNLIKELY(!res)) { - return res; + return Status::TryAgain("key+seq exists"); } } @@ -540,11 +601,11 @@ bool MemTable::Add(SequenceNumber s, ValueType type, } if (bloom_filter_ && prefix_extractor_ && - prefix_extractor_->InDomain(key)) { - bloom_filter_->Add(prefix_extractor_->Transform(key)); + prefix_extractor_->InDomain(key_without_ts)) { + bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->Add(StripTimestampFromUserKey(key, ts_sz)); + bloom_filter_->Add(key_without_ts); } // The first sequence number inserted into the memtable @@ -565,7 +626,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, ? table->InsertKeyConcurrently(handle) : table->InsertKeyWithHintConcurrently(handle, hint); if (UNLIKELY(!res)) { - return res; + return Status::TryAgain("key+seq exists"); } assert(post_process_info != nullptr); @@ -576,11 +637,12 @@ bool MemTable::Add(SequenceNumber s, ValueType type, } if (bloom_filter_ && prefix_extractor_ && - prefix_extractor_->InDomain(key)) { - bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key)); + prefix_extractor_->InDomain(key_without_ts)) { + bloom_filter_->AddConcurrently( + prefix_extractor_->Transform(key_without_ts)); } if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { - bloom_filter_->AddConcurrently(StripTimestampFromUserKey(key, ts_sz)); + bloom_filter_->AddConcurrently(key_without_ts); } // atomically update first_seqno_ and earliest_seqno_. 
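// A caller-side sketch of the Status-returning MemTable::Add() introduced
// above (an example under assumptions, not part of this patch; `mem`, `seq`,
// `key` and `value` are placeholders). Where callers previously branched on
// a bool, they now inspect the Status: TryAgain means the (key, seq) pair
// already exists and the memtable rep cannot handle duplicates, so the write
// should be retried with a larger sequence number; any other non-OK status
// (e.g. a mismatch reported by VerifyEncodedEntry() when kv_prot_info is
// supplied) is a real error.
Status add_status =
    mem->Add(seq, kTypeValue, key, value, nullptr /* kv_prot_info */);
if (add_status.IsTryAgain()) {
  // Retry the insertion with a larger `seq`.
} else if (!add_status.ok()) {
  // Propagate the failure (e.g. integrity check mismatch).
}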
@@ -599,7 +661,7 @@ bool MemTable::Add(SequenceNumber s, ValueType type, is_range_del_table_empty_.store(false, std::memory_order_relaxed); } UpdateOldestKeyTime(); - return true; + return Status::OK(); } // Callback from MemTable::Get() @@ -622,7 +684,8 @@ struct Saver { Statistics* statistics; bool inplace_update_support; bool do_merge; - Env* env_; + SystemClock* clock; + ReadCallback* callback_; bool* is_blob_index; bool allow_data_in_errors; @@ -660,8 +723,8 @@ static bool SaveValue(void* arg, const char* entry) { const Comparator* user_comparator = s->mem->GetInternalKeyComparator().user_comparator(); size_t ts_sz = user_comparator->timestamp_size(); - if (user_comparator->CompareWithoutTimestamp(user_key_slice, - s->key->user_key()) == 0) { + if (user_comparator->EqualWithoutTimestamp(user_key_slice, + s->key->user_key())) { // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; @@ -706,7 +769,7 @@ static bool SaveValue(void* arg, const char* entry) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), &v, merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->env_, nullptr /* result_operand */, true); + s->statistics, s->clock, nullptr /* result_operand */, true); } } else { // Preserve the value with the goal of returning it as part of @@ -745,7 +808,7 @@ static bool SaveValue(void* arg, const char* entry) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->env_, nullptr /* result_operand */, true); + s->statistics, s->clock, nullptr /* result_operand */, true); } } else { *(s->status) = Status::NotFound(); @@ -773,7 +836,7 @@ static bool SaveValue(void* arg, const char* entry) { *(s->status) = MergeHelper::TimedFullMerge( merge_operator, s->key->user_key(), nullptr, merge_context->GetOperands(), s->value, s->logger, s->statistics, - s->env_, nullptr /* result_operand */, true); + s->clock, nullptr /* result_operand */, true); *(s->found_final_value) = true; return false; } @@ -820,22 +883,21 @@ bool MemTable::Get(const LookupKey& key, std::string* value, range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key())); } - Slice user_key = key.user_key(); bool found_final_value = false; bool merge_in_progress = s->IsMergeInProgress(); bool may_contain = true; size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz); if (bloom_filter_) { // when both memtable_whole_key_filtering and prefix_extractor_ are set, // only do whole key filtering for Get() to save CPU if (moptions_.memtable_whole_key_filtering) { - may_contain = - bloom_filter_->MayContain(StripTimestampFromUserKey(user_key, ts_sz)); + may_contain = bloom_filter_->MayContain(user_key_without_ts); } else { assert(prefix_extractor_); - may_contain = - !prefix_extractor_->InDomain(user_key) || - bloom_filter_->MayContain(prefix_extractor_->Transform(user_key)); + may_contain = !prefix_extractor_->InDomain(user_key_without_ts) || + bloom_filter_->MayContain( + prefix_extractor_->Transform(user_key_without_ts)); } } @@ -882,7 +944,7 @@ void MemTable::GetFromTable(const LookupKey& key, saver.logger = moptions_.info_log; saver.inplace_update_support = moptions_.inplace_update_support; saver.statistics = moptions_.statistics; - saver.env_ = env_; + saver.clock = clock_; saver.callback_ = callback; saver.is_blob_index = is_blob_index; 
saver.do_merge = do_merge; @@ -892,7 +954,7 @@ void MemTable::GetFromTable(const LookupKey& key, } void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob) { + ReadCallback* callback) { // The sequence number is updated synchronously in version_set.h if (IsEmpty()) { // Avoiding recording stats for speed. @@ -908,16 +970,18 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, int num_keys = 0; for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { if (!prefix_extractor_) { - keys[num_keys++] = &iter->ukey; - } else if (prefix_extractor_->InDomain(iter->ukey)) { - prefixes.emplace_back(prefix_extractor_->Transform(iter->ukey)); + keys[num_keys++] = &iter->ukey_without_ts; + } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) { + prefixes.emplace_back( + prefix_extractor_->Transform(iter->ukey_without_ts)); keys[num_keys++] = &prefixes.back(); } } bloom_filter_->MayContain(num_keys, &keys[0], &may_match[0]); int idx = 0; for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) { - if (prefix_extractor_ && !prefix_extractor_->InDomain(iter->ukey)) { + if (prefix_extractor_ && + !prefix_extractor_->InDomain(iter->ukey_without_ts)) { PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); continue; } @@ -943,9 +1007,9 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key())); } GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true, - callback, is_blob, iter->value->GetSelf(), iter->timestamp, - iter->s, &(iter->merge_context), &seq, &found_final_value, - &merge_in_progress); + callback, &iter->is_blob_index, iter->value->GetSelf(), + iter->timestamp, iter->s, &(iter->merge_context), &seq, + &found_final_value, &merge_in_progress); if (!found_final_value && merge_in_progress) { *(iter->s) = Status::MergeInProgress(); @@ -970,9 +1034,9 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, PERF_COUNTER_ADD(get_from_memtable_count, 1); } -void MemTable::Update(SequenceNumber seq, - const Slice& key, - const Slice& value) { +Status MemTable::Update(SequenceNumber seq, const Slice& key, + const Slice& value, + const ProtectionInfoKVOTS64* kv_prot_info) { LookupKey lkey(key, seq); Slice mem_key = lkey.memtable_key(); @@ -1016,22 +1080,26 @@ void MemTable::Update(SequenceNumber seq, (unsigned)(VarintLength(key_length) + key_length + VarintLength(value.size()) + value.size())); RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); - return; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOTS64 updated_kv_prot_info(*kv_prot_info); + // `seq` is swallowed and `existing_seq` prevails. + updated_kv_prot_info.UpdateS(seq, existing_seq); + Slice encoded(entry, p + value.size() - entry); + return VerifyEncodedEntry(encoded, updated_kv_prot_info); + } + return Status::OK(); } } } } - // key doesn't exist - bool add_res __attribute__((__unused__)); - add_res = Add(seq, kTypeValue, key, value); - // We already checked unused != seq above. In that case, Add should not fail. 
- assert(add_res); + // The latest value is not `kTypeValue` or key doesn't exist + return Add(seq, kTypeValue, key, value, kv_prot_info); } -bool MemTable::UpdateCallback(SequenceNumber seq, - const Slice& key, - const Slice& delta) { +Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key, + const Slice& delta, + const ProtectionInfoKVOTS64* kv_prot_info) { LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); @@ -1057,8 +1125,8 @@ bool MemTable::UpdateCallback(SequenceNumber seq, // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; - uint64_t unused; - UnPackSequenceAndType(tag, &unused, &type); + uint64_t existing_seq; + UnPackSequenceAndType(tag, &existing_seq, &type); switch (type) { case kTypeValue: { Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); @@ -1085,16 +1153,35 @@ bool MemTable::UpdateCallback(SequenceNumber seq, } RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); UpdateFlushState(); - return true; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOTS64 updated_kv_prot_info(*kv_prot_info); + // `seq` is swallowed and `existing_seq` prevails. + updated_kv_prot_info.UpdateS(seq, existing_seq); + updated_kv_prot_info.UpdateV(delta, + Slice(prev_buffer, new_prev_size)); + Slice encoded(entry, prev_buffer + new_prev_size - entry); + return VerifyEncodedEntry(encoded, updated_kv_prot_info); + } + return Status::OK(); } else if (status == UpdateStatus::UPDATED) { - Add(seq, kTypeValue, key, Slice(str_value)); + Status s; + if (kv_prot_info != nullptr) { + ProtectionInfoKVOTS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(delta, str_value); + s = Add(seq, kTypeValue, key, Slice(str_value), + &updated_kv_prot_info); + } else { + s = Add(seq, kTypeValue, key, Slice(str_value), + nullptr /* kv_prot_info */); + } RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN); UpdateFlushState(); - return true; + return s; } else if (status == UpdateStatus::UPDATE_FAILED) { - // No action required. Return. + // `UPDATE_FAILED` is named incorrectly. It indicates no update + // happened. It does not indicate a failure happened. 
UpdateFlushState(); - return true; + return Status::OK(); } } default: @@ -1102,9 +1189,8 @@ bool MemTable::UpdateCallback(SequenceNumber seq, } } } - // If the latest value is not kTypeValue - // or key doesn't exist - return false; + // The latest value is not `kTypeValue` or key doesn't exist + return Status::NotFound(); } size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { diff --git a/db/memtable.h b/db/memtable.h index d5bd4e95aee..54155f9b575 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -15,7 +15,9 @@ #include #include #include + #include "db/dbformat.h" +#include "db/kv_checksum.h" #include "db/range_tombstone_fragmenter.h" #include "db/read_callback.h" #include "db/version_edit.h" @@ -24,7 +26,6 @@ #include "monitoring/instrumented_mutex.h" #include "options/cf_options.h" #include "rocksdb/db.h" -#include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "table/multiget_context.h" #include "util/dynamic_bloom.h" @@ -36,9 +37,10 @@ struct FlushJobInfo; class Mutex; class MemTableIterator; class MergeContext; +class SystemClock; struct ImmutableMemTableOptions { - explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions, + explicit ImmutableMemTableOptions(const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options); size_t arena_block_size; uint32_t memtable_prefix_bloom_bits; @@ -70,7 +72,7 @@ using MultiGetRange = MultiGetContext::Range; // Note: Many of the methods in this class have comments indicating that // external synchronization is required as these methods are not thread-safe. // It is up to higher layers of code to decide how to prevent concurrent -// invokation of these methods. This is usually done by acquiring either +// invocation of these methods. This is usually done by acquiring either // the db mutex or the single writer thread. // // Some of these methods are documented to only require external @@ -101,7 +103,7 @@ class MemTable { // used, but this may prevent some transactions from succeeding until the // first key is inserted into the memtable. explicit MemTable(const InternalKeyComparator& comparator, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, WriteBufferManager* write_buffer_manager, SequenceNumber earliest_seq, uint32_t column_family_id); @@ -137,7 +139,7 @@ class MemTable { // operations on the same MemTable (unless this Memtable is immutable). size_t ApproximateMemoryUsage(); - // As a cheap version of `ApproximateMemoryUsage()`, this function doens't + // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't // require external synchronization. The value may be less accurate though size_t ApproximateMemoryUsageFast() const { return approximate_memory_usage_.load(std::memory_order_relaxed); @@ -175,6 +177,9 @@ class MemTable { FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options, SequenceNumber read_seq); + Status VerifyEncodedEntry(Slice encoded, + const ProtectionInfoKVOTS64& kv_prot_info); + // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. // Typically value will be empty if type==kTypeDeletion. @@ -182,12 +187,14 @@ class MemTable { // REQUIRES: if allow_concurrent = false, external synchronization to prevent // simultaneous operations on the same MemTable. // - // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and - // the already exists. 
- bool Add(SequenceNumber seq, ValueType type, const Slice& key, - const Slice& value, bool allow_concurrent = false, - MemTablePostProcessInfo* post_process_info = nullptr, - void** hint = nullptr); + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. + Status Add(SequenceNumber seq, ValueType type, const Slice& key, + const Slice& value, const ProtectionInfoKVOTS64* kv_prot_info, + bool allow_concurrent = false, + MemTablePostProcessInfo* post_process_info = nullptr, + void** hint = nullptr); // Used to Get value associated with key or Get Merge Operands associated // with key. @@ -237,37 +244,38 @@ class MemTable { } void MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob); - - // Attempts to update the new_value inplace, else does normal Add - // Pseudocode - // if key exists in current memtable && prev_value is of type kTypeValue - // if new sizeof(new_value) <= sizeof(prev_value) - // update inplace - // else add(key, new_value) - // else add(key, new_value) + ReadCallback* callback); + + // If `key` exists in current memtable with type `kTypeValue` and the existing + // value is at least as large as the new value, updates it in-place. Otherwise + // adds the new value to the memtable out-of-place. + // + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. // // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. - void Update(SequenceNumber seq, - const Slice& key, - const Slice& value); - - // If prev_value for key exists, attempts to update it inplace. - // else returns false - // Pseudocode - // if key exists in current memtable && prev_value is of type kTypeValue - // new_value = delta(prev_value) - // if sizeof(new_value) <= sizeof(prev_value) - // update inplace - // else add(key, new_value) - // else return false + Status Update(SequenceNumber seq, const Slice& key, const Slice& value, + const ProtectionInfoKVOTS64* kv_prot_info); + + // If `key` exists in current memtable with type `kTypeValue` and the existing + // value is at least as large as the new value, updates it in-place. Otherwise + // if `key` exists in current memtable with type `kTypeValue`, adds the new + // value to the memtable out-of-place. + // + // Returns `Status::NotFound` if `key` does not exist in current memtable or + // the latest version of `key` does not have `kTypeValue`. + // + // Returns `Status::TryAgain` if the `seq`, `key` combination already exists + // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true. + // The next attempt should try a larger value for `seq`. // // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. 
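// A hypothetical sketch of how the contracts documented above compose (not
// part of this patch; `mem`, `seq`, `key`, `full_value` and `delta` are
// placeholders): UpdateCallback() now reports "nothing to update" via
// Status::NotFound() instead of returning false.
Status s = mem->UpdateCallback(seq, key, delta, nullptr /* kv_prot_info */);
if (s.IsNotFound()) {
  // No live kTypeValue entry for `key`; insert the full value instead.
  s = mem->Add(seq, kTypeValue, key, full_value, nullptr /* kv_prot_info */);
}
if (s.IsTryAgain()) {
  // (key, seq) already exists in the memtable; retry with a larger `seq`.
}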
- bool UpdateCallback(SequenceNumber seq, - const Slice& key, - const Slice& delta); + Status UpdateCallback(SequenceNumber seq, const Slice& key, + const Slice& delta, + const ProtectionInfoKVOTS64* kv_prot_info); // Returns the number of successive merge entries starting from the newest // entry for the key up to the last non-merge entry or last entry for the @@ -504,7 +512,7 @@ class MemTable { std::atomic flush_state_; - Env* env_; + SystemClock* clock_; // Extract sequential insert prefixes. const SliceTransform* insert_with_hint_prefix_extractor_; @@ -525,7 +533,7 @@ class MemTable { SequenceNumber atomic_flush_seqno_; // keep track of memory usage in table_, arena_, and range_del_table_. - // Gets refrshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow` + // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow` std::atomic approximate_memory_usage_; #ifndef ROCKSDB_LITE diff --git a/db/memtable_list.cc b/db/memtable_list.cc index dced9f7db7a..97d076b0377 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -113,10 +113,10 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value, } void MemTableListVersion::MultiGet(const ReadOptions& read_options, - MultiGetRange* range, ReadCallback* callback, - bool* is_blob) { + MultiGetRange* range, + ReadCallback* callback) { for (auto memtable : memlist_) { - memtable->MultiGet(read_options, range, callback, is_blob); + memtable->MultiGet(read_options, range, callback); if (range->empty()) { return; } @@ -334,7 +334,7 @@ bool MemTableList::IsFlushPending() const { } // Returns the memtables that need to be flushed. -void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id, +void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id, autovector* ret) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH); @@ -345,7 +345,7 @@ void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id, if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) { atomic_flush = true; } - if (max_memtable_id != nullptr && m->GetID() > *max_memtable_id) { + if (m->GetID() > max_memtable_id) { break; } if (!m->flush_in_progress_) { @@ -473,91 +473,42 @@ Status MemTableList::TryInstallMemtableFlushResults( // TODO(myabandeh): Not sure how batch_count could be 0 here. if (batch_count > 0) { + uint64_t min_wal_number_to_keep = 0; if (vset->db_options()->allow_2pc) { assert(edit_list.size() > 0); + min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC( + vset, *cfd, edit_list, memtables_to_flush, prep_tracker); // We piggyback the information of earliest log file to keep in the // manifest entry for the last file flushed. - edit_list.back()->SetMinLogNumberToKeep(PrecomputeMinLogNumberToKeep( - vset, *cfd, edit_list, memtables_to_flush, prep_tracker)); + edit_list.back()->SetMinLogNumberToKeep(min_wal_number_to_keep); } - // this can release and reacquire the mutex. - s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, - db_directory); - *io_s = vset->io_status(); - - // we will be changing the version in the next code path, - // so we better create a new one, since versions are immutable - InstallNewVersion(); - - // All the later memtables that have the same filenum - // are part of the same batch. They can be committed now. - uint64_t mem_id = 1; // how many memtables have been flushed. - - // commit new state only if the column family is NOT dropped. 
- // The reason is as follows (refer to - // ColumnFamilyTest.FlushAndDropRaceCondition). - // If the column family is dropped, then according to LogAndApply, its - // corresponding flush operation is NOT written to the MANIFEST. This - // means the DB is not aware of the L0 files generated from the flush. - // By committing the new state, we remove the memtable from the memtable - // list. Creating an iterator on this column family will not be able to - // read full data since the memtable is removed, and the DB is not aware - // of the L0 files, causing MergingIterator unable to build child - // iterators. RocksDB contract requires that the iterator can be created - // on a dropped column family, and we must be able to - // read full data as long as column family handle is not deleted, even if - // the column family is dropped. - if (s.ok() && !cfd->IsDropped()) { // commit new state - while (batch_count-- > 0) { - MemTable* m = current_->memlist_.back(); - if (m->edit_.GetBlobFileAdditions().empty()) { - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " done", - cfd->GetName().c_str(), m->file_number_, mem_id); - } else { - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Level-0 commit table #%" PRIu64 - " (+%zu blob files)" - ": memtable #%" PRIu64 " done", - cfd->GetName().c_str(), m->file_number_, - m->edit_.GetBlobFileAdditions().size(), mem_id); - } - - assert(m->file_number_ > 0); - current_->Remove(m, to_delete); - UpdateCachedValuesFromMemTableListVersion(); - ResetTrimHistoryNeeded(); - ++mem_id; + std::unique_ptr wal_deletion; + if (vset->db_options()->track_and_verify_wals_in_manifest) { + if (!vset->db_options()->allow_2pc) { + min_wal_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list); } - } else { - for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { - MemTable* m = *it; - // commit failed. setup state so that we can flush again. - if (m->edit_.GetBlobFileAdditions().empty()) { - ROCKS_LOG_BUFFER(log_buffer, - "Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " failed", - m->file_number_, mem_id); - } else { - ROCKS_LOG_BUFFER(log_buffer, - "Level-0 commit table #%" PRIu64 - " (+%zu blob files)" - ": memtable #%" PRIu64 " failed", - m->file_number_, - m->edit_.GetBlobFileAdditions().size(), mem_id); - } - - m->flush_completed_ = false; - m->flush_in_progress_ = false; - m->edit_.Clear(); - num_flush_not_started_++; - m->file_number_ = 0; - imm_flush_needed.store(true, std::memory_order_release); - ++mem_id; + if (min_wal_number_to_keep > + vset->GetWalSet().GetMinWalNumberToKeep()) { + wal_deletion.reset(new VersionEdit); + wal_deletion->DeleteWalsBefore(min_wal_number_to_keep); + edit_list.push_back(wal_deletion.get()); } } + + const auto manifest_write_cb = [this, cfd, batch_count, log_buffer, + to_delete, mu](const Status& status) { + RemoveMemTablesOrRestoreFlags(status, cfd, batch_count, log_buffer, + to_delete, mu); + }; + + // this can release and reacquire the mutex. + s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, + db_directory, /*new_descriptor_log=*/false, + /*column_family_options=*/nullptr, + manifest_write_cb); + *io_s = vset->io_status(); } } commit_in_progress_ = false; @@ -570,7 +521,7 @@ void MemTableList::Add(MemTable* m, autovector* to_delete) { InstallNewVersion(); // this method is used to move mutable memtable into an immutable list. 
// since mutable memtable is already refcounted by the DBImpl, - // and when moving to the imutable list we don't unref it, + // and when moving to the immutable list we don't unref it, // we don't have to ref the memtable here. we just take over the // reference from the DBImpl. current_->Add(m, to_delete); @@ -642,21 +593,93 @@ void MemTableList::InstallNewVersion() { } } +void MemTableList::RemoveMemTablesOrRestoreFlags( + const Status& s, ColumnFamilyData* cfd, size_t batch_count, + LogBuffer* log_buffer, autovector* to_delete, + InstrumentedMutex* mu) { + assert(mu); + mu->AssertHeld(); + assert(to_delete); + // we will be changing the version in the next code path, + // so we better create a new one, since versions are immutable + InstallNewVersion(); + + // All the later memtables that have the same filenum + // are part of the same batch. They can be committed now. + uint64_t mem_id = 1; // how many memtables have been flushed. + + // commit new state only if the column family is NOT dropped. + // The reason is as follows (refer to + // ColumnFamilyTest.FlushAndDropRaceCondition). + // If the column family is dropped, then according to LogAndApply, its + // corresponding flush operation is NOT written to the MANIFEST. This + // means the DB is not aware of the L0 files generated from the flush. + // By committing the new state, we remove the memtable from the memtable + // list. Creating an iterator on this column family will not be able to + // read full data since the memtable is removed, and the DB is not aware + // of the L0 files, causing MergingIterator unable to build child + // iterators. RocksDB contract requires that the iterator can be created + // on a dropped column family, and we must be able to + // read full data as long as column family handle is not deleted, even if + // the column family is dropped. + if (s.ok() && !cfd->IsDropped()) { // commit new state + while (batch_count-- > 0) { + MemTable* m = current_->memlist_.back(); + if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, + m->edit_.GetBlobFileAdditions().size(), mem_id); + } + + assert(m->file_number_ > 0); + current_->Remove(m, to_delete); + UpdateCachedValuesFromMemTableListVersion(); + ResetTrimHistoryNeeded(); + ++mem_id; + } + } else { + for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) { + MemTable* m = *it; + // commit failed. setup state so that we can flush again. 
+ if (m->edit_.GetBlobFileAdditions().empty()) { + ROCKS_LOG_BUFFER(log_buffer, + "Level-0 commit table #%" PRIu64 ": memtable #%" PRIu64 + " failed", + m->file_number_, mem_id); + } else { + ROCKS_LOG_BUFFER(log_buffer, + "Level-0 commit table #%" PRIu64 + " (+%zu blob files)" + ": memtable #%" PRIu64 " failed", + m->file_number_, + m->edit_.GetBlobFileAdditions().size(), mem_id); + } + + m->flush_completed_ = false; + m->flush_in_progress_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + m->file_number_ = 0; + imm_flush_needed.store(true, std::memory_order_release); + ++mem_id; + } + } +} + uint64_t MemTableList::PrecomputeMinLogContainingPrepSection( - const autovector& memtables_to_flush) { + const std::unordered_set* memtables_to_flush) { uint64_t min_log = 0; for (auto& m : current_->memlist_) { - // Assume the list is very short, we can live with O(m*n). We can optimize - // if the performance has some problem. - bool should_skip = false; - for (MemTable* m_to_flush : memtables_to_flush) { - if (m == m_to_flush) { - should_skip = true; - break; - } - } - if (should_skip) { + if (memtables_to_flush && memtables_to_flush->count(m)) { continue; } @@ -676,7 +699,8 @@ Status InstallMemtableAtomicFlushResults( const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, VersionSet* vset, - InstrumentedMutex* mu, const autovector& file_metas, + LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu, + const autovector& file_metas, autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer) { AutoThreadOperationStageUpdater stage_updater( @@ -688,6 +712,10 @@ Status InstallMemtableAtomicFlushResults( if (imm_lists != nullptr) { assert(imm_lists->size() == num); } + if (num == 0) { + return Status::OK(); + } + for (size_t k = 0; k != num; ++k) { #ifndef NDEBUG const auto* imm = @@ -716,12 +744,37 @@ Status InstallMemtableAtomicFlushResults( ++num_entries; edit_lists.emplace_back(edits); } + + WalNumber min_wal_number_to_keep = 0; + if (vset->db_options()->allow_2pc) { + min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC( + vset, cfds, edit_lists, mems_list, prep_tracker); + edit_lists.back().back()->SetMinLogNumberToKeep(min_wal_number_to_keep); + } + + std::unique_ptr wal_deletion; + if (vset->db_options()->track_and_verify_wals_in_manifest) { + if (!vset->db_options()->allow_2pc) { + min_wal_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists); + } + if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) { + wal_deletion.reset(new VersionEdit); + wal_deletion->DeleteWalsBefore(min_wal_number_to_keep); + edit_lists.back().push_back(wal_deletion.get()); + ++num_entries; + } + } + // Mark the version edits as an atomic group if the number of version edits // exceeds 1. 
if (cfds.size() > 1) { - for (auto& edits : edit_lists) { - assert(edits.size() == 1); - edits[0]->MarkAtomicGroup(--num_entries); + for (size_t i = 0; i < edit_lists.size(); i++) { + assert((edit_lists[i].size() == 1) || + ((edit_lists[i].size() == 2) && (i == edit_lists.size() - 1))); + for (auto& e : edit_lists[i]) { + e->MarkAtomicGroup(--num_entries); + } } assert(0 == num_entries); } diff --git a/db/memtable_list.h b/db/memtable_list.h index 72105d26667..493a54d4034 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -76,7 +76,7 @@ class MemTableListVersion { } void MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob); + ReadCallback* callback); // Returns all the merge operands corresponding to the key by searching all // memtables starting from the most recent one. @@ -138,8 +138,8 @@ class MemTableListVersion { const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, - VersionSet* vset, InstrumentedMutex* mu, - const autovector& file_meta, + VersionSet* vset, LogsWithPrepTracker* prep_tracker, + InstrumentedMutex* mu, const autovector& file_meta, autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer); @@ -251,7 +251,7 @@ class MemTableList { // Returns the earliest memtables that needs to be flushed. The returned // memtables are guaranteed to be in the ascending order of created time. - void PickMemtablesToFlush(const uint64_t* max_memtable_id, + void PickMemtablesToFlush(uint64_t max_memtable_id, autovector* mems); // Reset status of the given memtable list back to pending state so that @@ -335,7 +335,7 @@ class MemTableList { // Returns the min log containing the prep section after memtables listsed in // `memtables_to_flush` are flushed and their status is persisted in manifest. 
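// A caller-side sketch of the new PrecomputeMinLogContainingPrepSection()
// signature below (not part of this patch; `imm` is a MemTableList* and
// `mems` is the autovector of memtables picked for flush): passing the
// memtables as a set makes the exclusion check a hash lookup instead of the
// old O(m*n) scan over an autovector.
std::unordered_set<MemTable*> memtables_to_flush(mems.begin(), mems.end());
uint64_t min_log_with_prep =
    imm->PrecomputeMinLogContainingPrepSection(&memtables_to_flush);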
uint64_t PrecomputeMinLogContainingPrepSection( - const autovector& memtables_to_flush); + const std::unordered_set* memtables_to_flush = nullptr); uint64_t GetEarliestMemTableID() const { auto& memlist = current_->memlist_; @@ -381,14 +381,21 @@ class MemTableList { const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, - VersionSet* vset, InstrumentedMutex* mu, - const autovector& file_meta, + VersionSet* vset, LogsWithPrepTracker* prep_tracker, + InstrumentedMutex* mu, const autovector& file_meta, autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer); // DB mutex held void InstallNewVersion(); + // DB mutex held + // Called after writing to MANIFEST + void RemoveMemTablesOrRestoreFlags(const Status& s, ColumnFamilyData* cfd, + size_t batch_count, LogBuffer* log_buffer, + autovector* to_delete, + InstrumentedMutex* mu); + const int min_write_buffer_number_to_merge_; MemTableListVersion* current_; @@ -424,7 +431,8 @@ extern Status InstallMemtableAtomicFlushResults( const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, VersionSet* vset, - InstrumentedMutex* mu, const autovector& file_meta, + LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu, + const autovector& file_meta, autovector* to_delete, FSDirectory* db_directory, LogBuffer* log_buffer); } // namespace ROCKSDB_NAMESPACE diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index cc6e566ad57..165471b6bbb 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -30,14 +30,14 @@ class MemTableListTest : public testing::Test { MemTableListTest() : db(nullptr), file_number(1) { dbname = test::PerThreadDBPath("memtable_list_test"); options.create_if_missing = true; - DestroyDB(dbname, options); + EXPECT_OK(DestroyDB(dbname, options)); } // Create a test db if not yet created void CreateDB() { if (db == nullptr) { options.create_if_missing = true; - DestroyDB(dbname, options); + EXPECT_OK(DestroyDB(dbname, options)); // Open DB only with default column family ColumnFamilyOptions cf_options; std::vector cf_descs; @@ -78,7 +78,7 @@ class MemTableListTest : public testing::Test { handles.clear(); delete db; db = nullptr; - DestroyDB(dbname, options, cf_descs); + EXPECT_OK(DestroyDB(dbname, options, cf_descs)); } } @@ -103,7 +103,7 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, &write_controller, /*block_cache_tracer=*/nullptr, - /*io_tracer=*/nullptr); + /*io_tracer=*/nullptr, /*db_session_id*/ ""); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -153,7 +153,7 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, &write_controller, /*block_cache_tracer=*/nullptr, - /*io_tracer=*/nullptr); + /*io_tracer=*/nullptr, /*db_session_id*/ ""); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -185,8 +185,9 @@ class MemTableListTest : public testing::Test { InstrumentedMutex mutex; InstrumentedMutexLock l(&mutex); return InstallMemtableAtomicFlushResults( - &lists, cfds, mutable_cf_options_list, mems_list, &versions, &mutex, - file_meta_ptrs, to_delete, nullptr, &log_buffer); + &lists, cfds, 
mutable_cf_options_list, mems_list, &versions, + nullptr /* prep_tracker */, &mutex, file_meta_ptrs, to_delete, nullptr, + &log_buffer); } }; @@ -199,7 +200,7 @@ TEST_F(MemTableListTest, Empty) { ASSERT_FALSE(list.IsFlushPending()); autovector mems; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &mems); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &mems); ASSERT_EQ(0, mems.size()); autovector to_delete; @@ -234,7 +235,7 @@ TEST_F(MemTableListTest, GetTest) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, @@ -242,10 +243,14 @@ TEST_F(MemTableListTest, GetTest) { mem->Ref(); // Write some keys to this memtable. - mem->Add(++seq, kTypeDeletion, "key1", ""); - mem->Add(++seq, kTypeValue, "key2", "value2"); - mem->Add(++seq, kTypeValue, "key1", "value1"); - mem->Add(++seq, kTypeValue, "key2", "value2.2"); + ASSERT_OK( + mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", "value1", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2", + nullptr /* kv_prot_info */)); // Fetch the newly written keys merge_context.Clear(); @@ -283,8 +288,10 @@ TEST_F(MemTableListTest, GetTest) { kMaxSequenceNumber, 0 /* column_family_id */); mem2->Ref(); - mem2->Add(++seq, kTypeDeletion, "key1", ""); - mem2->Add(++seq, kTypeValue, "key2", "value2.3"); + ASSERT_OK( + mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem2->Add(++seq, kTypeValue, "key2", "value2.3", + nullptr /* kv_prot_info */)); // Add second memtable to list list.Add(mem2, &to_delete); @@ -351,7 +358,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, @@ -359,9 +366,12 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { mem->Ref(); // Write some keys to this memtable. - mem->Add(++seq, kTypeDeletion, "key1", ""); - mem->Add(++seq, kTypeValue, "key2", "value2"); - mem->Add(++seq, kTypeValue, "key2", "value2.2"); + ASSERT_OK( + mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2", + nullptr /* kv_prot_info */)); // Fetch the newly written keys merge_context.Clear(); @@ -399,7 +409,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Flush this memtable from the list. // (It will then be a part of the memtable history). 
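// A minimal sketch of the PickMemtablesToFlush() call pattern used in the
// tests below (not part of this patch; `list` is a MemTableList): the limit
// is now passed by value, and port::kMaxUint64 takes over the role the old
// nullptr argument played, i.e. "no upper bound on the memtable ID".
autovector<MemTable*> picked;
list.PickMemtablesToFlush(port::kMaxUint64 /* max_memtable_id */, &picked);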
autovector to_flush; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(1, to_flush.size()); MutableCFOptions mutable_cf_options(options); @@ -443,15 +453,17 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { kMaxSequenceNumber, 0 /* column_family_id */); mem2->Ref(); - mem2->Add(++seq, kTypeDeletion, "key1", ""); - mem2->Add(++seq, kTypeValue, "key3", "value3"); + ASSERT_OK( + mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */)); + ASSERT_OK(mem2->Add(++seq, kTypeValue, "key3", "value3", + nullptr /* kv_prot_info */)); // Add second memtable to list list.Add(mem2, &to_delete); ASSERT_EQ(0, to_delete.size()); to_flush.clear(); - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(1, to_flush.size()); // Flush second memtable @@ -527,7 +539,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); InternalKeyComparator cmp(BytewiseComparator()); WriteBufferManager wb(options.db_write_buffer_size); autovector to_delete; @@ -554,11 +566,16 @@ TEST_F(MemTableListTest, FlushPendingTest) { std::string value; MergeContext merge_context; - mem->Add(++seq, kTypeValue, "key1", ToString(i)); - mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN"); - mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value"); - mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM"); - mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), ""); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", ToString(i), + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "", + nullptr /* kv_prot_info */)); tables.push_back(mem); } @@ -567,7 +584,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); autovector to_flush; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(0, to_flush.size()); // Request a flush even though there is nothing to flush @@ -576,7 +593,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); // Attempt to 'flush' to clear request for flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(0, to_flush.size()); ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -600,7 +617,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); // Pick tables to flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(2, to_flush.size()); ASSERT_EQ(2, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -621,7 +638,7 @@ 
TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_EQ(0, to_delete.size()); // Pick tables to flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); ASSERT_EQ(3, to_flush.size()); ASSERT_EQ(3, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -629,7 +646,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { // Pick tables to flush again autovector to_flush2; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush2); ASSERT_EQ(0, to_flush2.size()); ASSERT_EQ(3, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -647,7 +664,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); // Pick tables to flush again - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush2); ASSERT_EQ(1, to_flush2.size()); ASSERT_EQ(4, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -668,7 +685,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_EQ(0, to_delete.size()); // Pick tables to flush - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush); // Should pick 4 of 5 since 1 table has been picked in to_flush2 ASSERT_EQ(4, to_flush.size()); ASSERT_EQ(5, list.NumNotFlushed()); @@ -677,7 +694,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { // Pick tables to flush again autovector to_flush3; - list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush3); + list.PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, &to_flush3); ASSERT_EQ(0, to_flush3.size()); // nothing not in progress of being flushed ASSERT_EQ(5, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -738,7 +755,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { autovector to_flush4; list.FlushRequested(); ASSERT_TRUE(list.HasFlushRequested()); - list.PickMemtablesToFlush(&memtable_id, &to_flush4); + list.PickMemtablesToFlush(memtable_id, &to_flush4); ASSERT_TRUE(to_flush4.empty()); ASSERT_EQ(1, list.NumNotFlushed()); ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -749,7 +766,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { // equal to 5. Therefore, only tables[5] will be selected. 
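The comment above spells out the rule behind the signature change in this file: PickMemtablesToFlush now takes the maximum memtable ID by value, with port::kMaxUint64 standing in for the old nullptr ("no limit"). Purely as an illustration of that selection rule -- ToyMemTable and PickToFlush below are hypothetical names, not the RocksDB implementation:

    #include <cstdint>
    #include <vector>

    struct ToyMemTable {
      uint64_t id;
      bool flush_in_progress = false;
    };

    // Pick every not-yet-flushing memtable whose ID is <= max_memtable_id.
    // Passing the largest uint64_t value (the analogue of port::kMaxUint64)
    // means "no upper bound", i.e. pick everything that is eligible.
    std::vector<ToyMemTable*> PickToFlush(std::vector<ToyMemTable>& imm,
                                          uint64_t max_memtable_id) {
      std::vector<ToyMemTable*> picked;
      for (ToyMemTable& m : imm) {
        if (!m.flush_in_progress && m.id <= max_memtable_id) {
          m.flush_in_progress = true;
          picked.push_back(&m);
        }
      }
      return picked;
    }

Under this rule, PickToFlush(imm, 5) would select only tables with ID at most 5, which is the behaviour the surrounding test asserts.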
memtable_id = 5; list.FlushRequested(); - list.PickMemtablesToFlush(&memtable_id, &to_flush4); + list.PickMemtablesToFlush(memtable_id, &to_flush4); ASSERT_EQ(1, static_cast(to_flush4.size())); ASSERT_EQ(1, list.NumNotFlushed()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -791,7 +808,7 @@ TEST_F(MemTableListTest, AtomicFlusTest) { auto factory = std::make_shared(); options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); InternalKeyComparator cmp(BytewiseComparator()); WriteBufferManager wb(options.db_write_buffer_size); @@ -823,11 +840,16 @@ TEST_F(MemTableListTest, AtomicFlusTest) { std::string value; - mem->Add(++seq, kTypeValue, "key1", ToString(i)); - mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN"); - mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value"); - mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM"); - mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), ""); + ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", ToString(i), + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM", + nullptr /* kv_prot_info */)); + ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "", + nullptr /* kv_prot_info */)); elem.push_back(mem); } @@ -841,7 +863,8 @@ TEST_F(MemTableListTest, AtomicFlusTest) { auto* list = lists[i]; ASSERT_FALSE(list->IsFlushPending()); ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire)); - list->PickMemtablesToFlush(nullptr /* memtable_id */, &flush_candidates[i]); + list->PickMemtablesToFlush(port::kMaxUint64 /* memtable_id */, + &flush_candidates[i]); ASSERT_EQ(0, flush_candidates[i].size()); } // Request flush even though there is nothing to flush @@ -871,8 +894,7 @@ TEST_F(MemTableListTest, AtomicFlusTest) { // Pick memtables to flush for (auto i = 0; i != num_cfs; ++i) { flush_candidates[i].clear(); - lists[i]->PickMemtablesToFlush(&flush_memtable_ids[i], - &flush_candidates[i]); + lists[i]->PickMemtablesToFlush(flush_memtable_ids[i], &flush_candidates[i]); ASSERT_EQ(flush_memtable_ids[i] - 0 + 1, static_cast(flush_candidates[i].size())); } diff --git a/db/merge_context.h b/db/merge_context.h index e1869a341b0..925bfc0e068 100644 --- a/db/merge_context.h +++ b/db/merge_context.h @@ -68,7 +68,7 @@ class MergeContext { } // Get the operand at the index. - Slice GetOperand(int index) { + Slice GetOperand(int index) const { assert(operand_list_); SetDirectionForward(); @@ -76,13 +76,21 @@ class MergeContext { } // Same as GetOperandsDirectionForward - const std::vector& GetOperands() { + // + // Note that the returned reference is only good until another call + // to this MergeContext. If the returned value is needed for longer, + // a copy must be made. + const std::vector& GetOperands() const { return GetOperandsDirectionForward(); } // Return all the operands in the order as they were merged (passed to // FullMerge or FullMergeV2) - const std::vector& GetOperandsDirectionForward() { + // + // Note that the returned reference is only good until another call + // to this MergeContext. If the returned value is needed for longer, + // a copy must be made. 
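As an aside on the lifetime note just added above: if the merge operands are needed after the MergeContext is used again, they have to be copied out first. A minimal sketch of such a copy (illustrative only; assumes it sits inside the rocksdb namespace with db/merge_context.h, <string> and <vector> available):

    // Not part of the patch: GetOperands() returns a reference that the next
    // call on the same MergeContext may invalidate, so take deep copies when
    // the operands must outlive it.
    std::vector<std::string> CopyOperands(const MergeContext& merge_context) {
      std::vector<std::string> copies;
      for (const Slice& op : merge_context.GetOperands()) {
        copies.emplace_back(op.ToString());  // deep copy of each operand
      }
      return copies;
    }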
+ const std::vector& GetOperandsDirectionForward() const { if (!operand_list_) { return empty_operand_list; } @@ -93,7 +101,11 @@ class MergeContext { // Return all the operands in the reversed order relative to how they were // merged (passed to FullMerge or FullMergeV2) - const std::vector& GetOperandsDirectionBackward() { + // + // Note that the returned reference is only good until another call + // to this MergeContext. If the returned value is needed for longer, + // a copy must be made. + const std::vector& GetOperandsDirectionBackward() const { if (!operand_list_) { return empty_operand_list; } @@ -110,14 +122,14 @@ class MergeContext { } } - void SetDirectionForward() { + void SetDirectionForward() const { if (operands_reversed_ == true) { std::reverse(operand_list_->begin(), operand_list_->end()); operands_reversed_ = false; } } - void SetDirectionBackward() { + void SetDirectionBackward() const { if (operands_reversed_ == false) { std::reverse(operand_list_->begin(), operand_list_->end()); operands_reversed_ = true; @@ -125,10 +137,10 @@ class MergeContext { } // List of operands - std::unique_ptr> operand_list_; + mutable std::unique_ptr> operand_list_; // Copy of operands that are not pinned. std::unique_ptr>> copied_operands_; - bool operands_reversed_ = true; + mutable bool operands_reversed_ = true; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/merge_helper.cc b/db/merge_helper.cc index ebfd22a7dc3..31cd3b6c587 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -14,6 +14,7 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/system_clock.h" #include "table/format.h" #include "table/internal_iterator.h" @@ -28,6 +29,7 @@ MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator, Statistics* stats, const std::atomic* shutting_down) : env_(env), + clock_(env->GetSystemClock().get()), user_comparator_(user_comparator), user_merge_operator_(user_merge_operator), compaction_filter_(compaction_filter), @@ -39,7 +41,7 @@ MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator, snapshot_checker_(snapshot_checker), level_(level), keys_(), - filter_timer_(env_), + filter_timer_(clock_), total_filter_time_(0U), stats_(stats) { assert(user_comparator_ != nullptr); @@ -52,7 +54,7 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, const Slice& key, const Slice* value, const std::vector& operands, std::string* result, Logger* logger, - Statistics* statistics, Env* env, + Statistics* statistics, SystemClock* clock, Slice* result_operand, bool update_num_ops_stats) { assert(merge_operator != nullptr); @@ -75,7 +77,7 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand); { // Setup to time the merge - StopWatchNano timer(env, statistics != nullptr); + StopWatchNano timer(clock, statistics != nullptr); PERF_TIMER_GUARD(merge_operator_time_nanos); // Do the merge @@ -116,7 +118,8 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, Status MergeHelper::MergeUntil(InternalIterator* iter, CompactionRangeDelAggregator* range_del_agg, const SequenceNumber stop_before, - const bool at_bottom) { + const bool at_bottom, + const bool allow_data_in_errors) { // Get a copy of the internal key, before it's invalidated by iter->Next() // Also maintain the list of merge operands seen. 
assert(HasOperator()); @@ -139,7 +142,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // orig_ikey is backed by keys_.back() if !keys_.empty() ParsedInternalKey orig_ikey; - Status s = ParseInternalKey(original_key, &orig_ikey); + Status s = ParseInternalKey(original_key, &orig_ikey, allow_data_in_errors); assert(s.ok()); if (!s.ok()) return s; @@ -153,12 +156,12 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, ParsedInternalKey ikey; assert(keys_.size() == merge_context_.GetNumOperands()); - if (ParseInternalKey(iter->key(), &ikey) != Status::OK()) { + Status pik_status = + ParseInternalKey(iter->key(), &ikey, allow_data_in_errors); + if (!pik_status.ok()) { // stop at corrupted key if (assert_valid_internal_key_) { - assert(!"Corrupted internal key not expected."); - s = Status::Corruption("Corrupted internal key not expected."); - return s; + return pik_status; } break; } else if (first_key) { @@ -212,7 +215,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, std::string merge_result; s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr, merge_context_.GetOperands(), &merge_result, logger_, - stats_, env_); + stats_, clock_); // We store the result in keys_.back() and operands_.back() // if nothing went wrong (i.e.: no operand corruption on disk) @@ -267,9 +270,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, if (keys_.size() == 1) { // we need to re-anchor the orig_ikey because it was anchored by // original_key before - Status pikStatus = ParseInternalKey(keys_.back(), &orig_ikey); - pikStatus.PermitUncheckedError(); - assert(pikStatus.ok()); + pik_status = + ParseInternalKey(keys_.back(), &orig_ikey, allow_data_in_errors); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); } if (filter == CompactionFilter::Decision::kKeep) { merge_context_.PushOperand( @@ -322,7 +326,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, std::string merge_result; s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, nullptr, merge_context_.GetOperands(), &merge_result, logger_, - stats_, env_); + stats_, clock_); if (s.ok()) { // The original key encountered // We are certain that keys_ is not empty here (see assertions couple of @@ -345,7 +349,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, bool merge_success = false; std::string merge_result; { - StopWatchNano timer(env_, stats_ != nullptr); + StopWatchNano timer(clock_, stats_ != nullptr); PERF_TIMER_GUARD(merge_operator_time_nanos); merge_success = user_merge_operator_->PartialMergeMulti( orig_ikey.user_key, diff --git a/db/merge_helper.h b/db/merge_helper.h index c0534f08ba4..f3bcd948b44 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -25,6 +25,7 @@ class Iterator; class Logger; class MergeOperator; class Statistics; +class SystemClock; class MergeHelper { public: @@ -48,7 +49,7 @@ class MergeHelper { const Slice& key, const Slice* value, const std::vector& operands, std::string* result, Logger* logger, - Statistics* statistics, Env* env, + Statistics* statistics, SystemClock* clock, Slice* result_operand = nullptr, bool update_num_ops_stats = false); @@ -66,6 +67,8 @@ class MergeHelper { // 0 means no restriction // at_bottom: (IN) true if the iterator covers the bottem level, which means // we could reach the start of the history of this user key. + // allow_data_in_errors: (IN) if true, data details will be displayed in + // error/log messages. // // Returns one of the following statuses: // - OK: Entries were successfully merged. 
@@ -80,7 +83,8 @@ class MergeHelper { Status MergeUntil(InternalIterator* iter, CompactionRangeDelAggregator* range_del_agg = nullptr, const SequenceNumber stop_before = 0, - const bool at_bottom = false); + const bool at_bottom = false, + const bool allow_data_in_errors = false); // Filters a merge operand using the compaction filter specified // in the constructor. Returns the decision that the filter made. @@ -137,6 +141,7 @@ class MergeHelper { private: Env* env_; + SystemClock* clock_; const Comparator* user_comparator_; const MergeOperator* user_merge_operator_; const CompactionFilter* compaction_filter_; diff --git a/db/merge_test.cc b/db/merge_test.cc index 76716aefab0..2cca0735ef4 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -4,8 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). // #include -#include + #include +#include #include "db/db_impl/db_impl.h" #include "db/dbformat.h" @@ -18,6 +19,7 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" #include "test_util/testharness.h" +#include "util/coding.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { @@ -97,9 +99,9 @@ std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, options.create_if_missing = true; options.merge_operator = std::make_shared(); options.max_successive_merges = max_successive_merges; - Status s; options.env = EnvMergeTest::GetInstance(); - DestroyDB(dbname, Options()); + EXPECT_OK(DestroyDB(dbname, Options())); + Status s; // DBWithTTL is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE if (ttl) { @@ -113,10 +115,8 @@ std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, assert(!ttl); s = DB::Open(options, dbname, &db); #endif // !ROCKSDB_LITE - if (!s.ok()) { - std::cerr << s.ToString() << std::endl; - assert(false); - } + EXPECT_OK(s); + assert(s.ok()); return std::shared_ptr(db); } @@ -271,21 +271,25 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { counters.assert_set("a", 1); - if (test_compaction) db->Flush(o); + if (test_compaction) { + ASSERT_OK(db->Flush(o)); + } - assert(counters.assert_get("a") == 1); + ASSERT_EQ(counters.assert_get("a"), 1); counters.assert_remove("b"); // defaut value is 0 if non-existent - assert(counters.assert_get("b") == 0); + ASSERT_EQ(counters.assert_get("b"), 0); counters.assert_add("a", 2); - if (test_compaction) db->Flush(o); + if (test_compaction) { + ASSERT_OK(db->Flush(o)); + } // 1+2 = 3 - assert(counters.assert_get("a")== 3); + ASSERT_EQ(counters.assert_get("a"), 3); dumpDb(db); @@ -295,22 +299,112 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { counters.assert_add("b", i); sum += i; } - assert(counters.assert_get("b") == sum); + ASSERT_EQ(counters.assert_get("b"), sum); dumpDb(db); if (test_compaction) { - db->Flush(o); + ASSERT_OK(db->Flush(o)); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); dumpDb(db); - assert(counters.assert_get("a")== 3); - assert(counters.assert_get("b") == sum); + ASSERT_EQ(counters.assert_get("a"), 3); + ASSERT_EQ(counters.assert_get("b"), sum); } } +void testCountersWithFlushAndCompaction(Counters& counters, DB* db) { + ASSERT_OK(db->Put({}, "1", "1")); + ASSERT_OK(db->Flush(FlushOptions())); + + std::atomic cnt{0}; + const auto get_thread_id = [&cnt]() { + thread_local int thread_id{cnt++}; + return thread_id; + }; + SyncPoint::GetInstance()->DisableProcessing(); + 
SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:BeforeWriterWaiting", [&](void* /*arg*/) { + int thread_id = get_thread_id(); + if (1 == thread_id) { + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_compact_thread:0"); + } else if (2 == thread_id) { + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_flush_thread:0"); + } + }); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WriteManifest", [&](void* /*arg*/) { + int thread_id = get_thread_id(); + if (0 == thread_id) { + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::set_options_thread:0"); + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::set_options_thread:1"); + } + }); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:WakeUpAndDone", [&](void* arg) { + auto* mutex = reinterpret_cast(arg); + mutex->AssertHeld(); + int thread_id = get_thread_id(); + ASSERT_EQ(2, thread_id); + mutex->Unlock(); + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_flush_thread:1"); + TEST_SYNC_POINT( + "testCountersWithFlushAndCompaction::bg_flush_thread:2"); + mutex->Lock(); + }); + SyncPoint::GetInstance()->LoadDependency({ + {"testCountersWithFlushAndCompaction::set_options_thread:0", + "testCountersWithCompactionAndFlush:BeforeCompact"}, + {"testCountersWithFlushAndCompaction::bg_compact_thread:0", + "testCountersWithFlushAndCompaction:BeforeIncCounters"}, + {"testCountersWithFlushAndCompaction::bg_flush_thread:0", + "testCountersWithFlushAndCompaction::set_options_thread:1"}, + {"testCountersWithFlushAndCompaction::bg_flush_thread:1", + "testCountersWithFlushAndCompaction:BeforeVerification"}, + {"testCountersWithFlushAndCompaction:AfterGet", + "testCountersWithFlushAndCompaction::bg_flush_thread:2"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread set_options_thread([&]() { + ASSERT_OK(reinterpret_cast(db)->SetOptions( + {{"disable_auto_compactions", "false"}})); + }); + TEST_SYNC_POINT("testCountersWithCompactionAndFlush:BeforeCompact"); + port::Thread compact_thread([&]() { + ASSERT_OK(reinterpret_cast(db)->CompactRange( + CompactRangeOptions(), db->DefaultColumnFamily(), nullptr, nullptr)); + }); + + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeIncCounters"); + counters.add("test-key", 1); + + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(db->Flush(flush_opts)); + + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeVerification"); + std::string expected; + PutFixed64(&expected, 1); + std::string actual; + Status s = db->Get(ReadOptions(), "test-key", &actual); + TEST_SYNC_POINT("testCountersWithFlushAndCompaction:AfterGet"); + set_options_thread.join(); + compact_thread.join(); + ASSERT_OK(s); + ASSERT_EQ(expected, actual); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + void testSuccessiveMerge(Counters& counters, size_t max_num_merges, size_t num_merges) { counters.assert_remove("z"); @@ -322,14 +416,14 @@ void testSuccessiveMerge(Counters& counters, size_t max_num_merges, sum += i; if (i % (max_num_merges + 1) == 0) { - assert(num_merge_operator_calls == max_num_merges + 1); + ASSERT_EQ(num_merge_operator_calls, max_num_merges + 1); } else { - assert(num_merge_operator_calls == 0); + ASSERT_EQ(num_merge_operator_calls, 0); } resetNumMergeOperatorCalls(); - assert(counters.assert_get("z") == sum); - assert(num_merge_operator_calls == i % (max_num_merges + 1)); + 
ASSERT_EQ(counters.assert_get("z"), sum); + ASSERT_EQ(num_merge_operator_calls, i % (max_num_merges + 1)); } } @@ -346,8 +440,8 @@ void testPartialMerge(Counters* counters, DB* db, size_t max_merge, counters->assert_add("b", i); tmp_sum += i; } - db->Flush(o); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->Flush(o)); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(tmp_sum, counters->assert_get("b")); if (count > max_merge) { // in this case, FullMerge should be called instead. @@ -360,13 +454,13 @@ void testPartialMerge(Counters* counters, DB* db, size_t max_merge, // Test case 2: partial merge should not be called when a put is found. resetNumPartialMergeCalls(); tmp_sum = 0; - db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10"); + ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10")); for (size_t i = 1; i <= count; i++) { counters->assert_add("c", i); tmp_sum += i; } - db->Flush(o); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->Flush(o)); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(tmp_sum, counters->assert_get("c")); ASSERT_EQ(num_partial_merge_calls, 0U); ASSERT_EQ(EnvMergeTest::now_nanos_count_, 0U); @@ -374,7 +468,7 @@ void testPartialMerge(Counters* counters, DB* db, size_t max_merge, void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, size_t num_merges) { - assert(num_merges > max_num_merges); + ASSERT_GT(num_merges, max_num_merges); Slice key("BatchSuccessiveMerge"); uint64_t merge_value = 1; @@ -385,15 +479,12 @@ void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, // Create the batch WriteBatch batch; for (size_t i = 0; i < num_merges; ++i) { - batch.Merge(key, merge_value_slice); + ASSERT_OK(batch.Merge(key, merge_value_slice)); } // Apply to memtable and count the number of merges resetNumMergeOperatorCalls(); - { - Status s = db->Write(WriteOptions(), &batch); - assert(s.ok()); - } + ASSERT_OK(db->Write(WriteOptions(), &batch)); ASSERT_EQ( num_merge_operator_calls, static_cast(num_merges - (num_merges % (max_num_merges + 1)))); @@ -401,10 +492,7 @@ void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, // Get the value resetNumMergeOperatorCalls(); std::string get_value_str; - { - Status s = db->Get(ReadOptions(), key, &get_value_str); - assert(s.ok()); - } + ASSERT_OK(db->Get(ReadOptions(), key, &get_value_str)); assert(get_value_str.size() == sizeof(uint64_t)); uint64_t get_value = DecodeFixed64(&get_value_str[0]); ASSERT_EQ(get_value, num_merges * merge_value); @@ -427,7 +515,7 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { } } - DestroyDB(dbname, Options()); + ASSERT_OK(DestroyDB(dbname, Options())); { size_t max_merge = 5; @@ -436,7 +524,8 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { testCounters(counters, db.get(), use_compression); testSuccessiveMerge(counters, max_merge, max_merge * 2); testSingleBatchSuccessiveMerge(db.get(), 5, 7); - DestroyDB(dbname, Options()); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); } { @@ -447,14 +536,16 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { auto db = OpenDb(dbname, use_ttl, max_merge); MergeBasedCounters counters(db, 0); testPartialMerge(&counters, db.get(), max_merge, min_merge, count); - DestroyDB(dbname, Options()); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); } { auto db = OpenDb(dbname, use_ttl, max_merge); 
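A pattern running through this test file's changes is that every Status-returning call is now checked instead of silently dropped. For reference, a self-contained sketch of the open/use/close/destroy cycle with explicit status checks; the path and options here are illustrative and not taken from the test:

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    void StatusCheckedLifecycle() {
      rocksdb::Options options;
      options.create_if_missing = true;
      const std::string name = "/tmp/rocksdb_status_demo";  // illustrative path

      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, name, &db);
      assert(s.ok());

      s = db->Put(rocksdb::WriteOptions(), "key", "value");
      assert(s.ok());

      s = db->Close();  // Close() also returns a Status worth checking
      assert(s.ok());
      delete db;

      s = rocksdb::DestroyDB(name, options);
      assert(s.ok());
    }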
MergeBasedCounters counters(db, 0); testPartialMerge(&counters, db.get(), max_merge, min_merge, min_merge * 10); - DestroyDB(dbname, Options()); + ASSERT_OK(db->Close()); + ASSERT_OK(DestroyDB(dbname, Options())); } } @@ -465,15 +556,15 @@ void runTest(const std::string& dbname, const bool use_ttl = false) { counters.add("test-key", 1); counters.add("test-key", 1); counters.add("test-key", 1); - db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } DB* reopen_db; ASSERT_OK(DB::Open(Options(), dbname, &reopen_db)); std::string value; - ASSERT_TRUE(!(reopen_db->Get(ReadOptions(), "test-key", &value).ok())); + ASSERT_NOK(reopen_db->Get(ReadOptions(), "test-key", &value)); delete reopen_db; - DestroyDB(dbname, Options()); + ASSERT_OK(DestroyDB(dbname, Options())); } /* Temporary remove this test @@ -502,6 +593,19 @@ TEST_F(MergeTest, MergeDbTtlTest) { runTest(test::PerThreadDBPath("merge_testdbttl"), true); // Run test on TTL database } + +TEST_F(MergeTest, MergeWithCompactionAndFlush) { + const std::string dbname = + test::PerThreadDBPath("merge_with_compaction_and_flush"); + { + auto db = OpenDb(dbname); + { + MergeBasedCounters counters(db, 0); + testCountersWithFlushAndCompaction(counters, db.get()); + } + } + ASSERT_OK(DestroyDB(dbname, Options())); +} #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/obsolete_files_test.cc b/db/obsolete_files_test.cc index 98d98eae242..ee6b0763997 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -28,10 +28,6 @@ #include "test_util/testutil.h" #include "util/string_util.h" -using std::cerr; -using std::cout; -using std::endl; -using std::flush; namespace ROCKSDB_NAMESPACE { @@ -65,7 +61,7 @@ class ObsoleteFilesTest : public DBTestBase { void CheckFileTypeCounts(const std::string& dir, int required_log, int required_sst, int required_manifest) { std::vector filenames; - env_->GetChildren(dir, &filenames); + ASSERT_OK(env_->GetChildren(dir, &filenames)); int log_cnt = 0; int sst_cnt = 0; @@ -74,7 +70,7 @@ class ObsoleteFilesTest : public DBTestBase { uint64_t number; FileType type; if (ParseFileName(file, &number, &type)) { - log_cnt += (type == kLogFile); + log_cnt += (type == kWalFile); sst_cnt += (type == kTableFile); manifest_cnt += (type == kDescriptorFile); } @@ -98,6 +94,12 @@ class ObsoleteFilesTest : public DBTestBase { options.WAL_ttl_seconds = 300; // Used to test log files options.WAL_size_limit_MB = 1024; // Used to test log files options.wal_dir = wal_dir_; + + // Note: the following prevents an otherwise harmless data race between the + // test setup code (AddBlobFile) in ObsoleteFilesTest.BlobFiles and the + // periodic stat dumping thread. 
+ options.stats_dump_period_sec = 0; + Destroy(options); Reopen(options); } @@ -196,6 +198,8 @@ TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { } TEST_F(ObsoleteFilesTest, BlobFiles) { + ReopenDB(); + VersionSet* const versions = dbfull()->TEST_GetVersionSet(); assert(versions); assert(versions->GetColumnFamilySet()); diff --git a/db/options_file_test.cc b/db/options_file_test.cc index 7ad84642779..3ff7e0952b4 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -65,7 +65,7 @@ TEST_F(OptionsFileTest, NumberOfOptionsFiles) { const int kReopenCount = 20; Options opt; opt.create_if_missing = true; - DestroyDB(dbname_, opt); + ASSERT_OK(DestroyDB(dbname_, opt)); std::unordered_set filename_history; DB* db; for (int i = 0; i < kReopenCount; ++i) { diff --git a/db/output_validator.cc b/db/output_validator.cc index 56b8fe59ef8..c36c9281e15 100644 --- a/db/output_validator.cc +++ b/db/output_validator.cc @@ -9,8 +9,8 @@ namespace ROCKSDB_NAMESPACE { Status OutputValidator::Add(const Slice& key, const Slice& value) { if (enable_hash_) { // Generate a rolling 64-bit hash of the key and values - paranoid_hash_ = Hash64(key.data(), key.size(), paranoid_hash_); - paranoid_hash_ = Hash64(value.data(), value.size(), paranoid_hash_); + paranoid_hash_ = NPHash64(key.data(), key.size(), paranoid_hash_); + paranoid_hash_ = NPHash64(value.data(), value.size(), paranoid_hash_); } if (enable_order_check_) { TEST_SYNC_POINT_CALLBACK("OutputValidator::Add:order_check", diff --git a/db/output_validator.h b/db/output_validator.h index 167b25e0615..ad9000d5e3b 100644 --- a/db/output_validator.h +++ b/db/output_validator.h @@ -17,8 +17,10 @@ namespace ROCKSDB_NAMESPACE { class OutputValidator { public: explicit OutputValidator(const InternalKeyComparator& icmp, - bool enable_order_check, bool enable_hash) + bool enable_order_check, bool enable_hash, + uint64_t precalculated_hash = 0) : icmp_(icmp), + paranoid_hash_(precalculated_hash), enable_order_check_(enable_order_check), enable_hash_(enable_hash) {} @@ -33,9 +35,11 @@ class OutputValidator { return GetHash() == other_validator.GetHash(); } - private: + // Not (yet) intended to be persisted, so subject to change + // without notice between releases. uint64_t GetHash() const { return paranoid_hash_; } + private: const InternalKeyComparator& icmp_; std::string prev_key_; uint64_t paranoid_hash_ = 0; diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 5a714b9b85a..908e684f73e 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -3,6 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
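Earlier in this hunk, OutputValidator::Add switches to NPHash64 and chains the previous digest in as the seed, while the new precalculated_hash constructor argument lets a validator resume from a prior digest. The sketch below shows that chaining pattern with a toy hash; ToyHash64 and ToyValidator are made-up names and the hash itself is not NPHash64:

    #include <cstddef>
    #include <cstdint>
    #include <string>

    // Toy seeded 64-bit hash (FNV-style); stands in for NPHash64, which differs.
    inline uint64_t ToyHash64(const char* data, size_t n, uint64_t seed) {
      uint64_t h = seed ^ 0xcbf29ce484222325ULL;
      for (size_t i = 0; i < n; ++i) {
        h ^= static_cast<unsigned char>(data[i]);
        h *= 0x100000001b3ULL;
      }
      return h;
    }

    // Mirrors the chaining in OutputValidator::Add: each key and value folds
    // the previous digest in as the seed, so the final value covers the whole
    // key/value stream, and a precalculated digest can seed a new instance.
    class ToyValidator {
     public:
      explicit ToyValidator(uint64_t precalculated_hash = 0)
          : hash_(precalculated_hash) {}
      void Add(const std::string& key, const std::string& value) {
        hash_ = ToyHash64(key.data(), key.size(), hash_);
        hash_ = ToyHash64(value.data(), value.size(), hash_);
      }
      uint64_t GetHash() const { return hash_; }
     private:
      uint64_t hash_;
    };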
// +#include "rocksdb/perf_context.h" + #include #include #include @@ -15,8 +17,8 @@ #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" -#include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "test_util/testharness.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -76,12 +78,12 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { std::string key = "k" + ToString(i); std::string value = "v" + ToString(i); - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); } for (int i = 0; i < FLAGS_total_keys -1 ; ++i) { std::string key = "k" + ToString(i); - db->Delete(write_options, key); + ASSERT_OK(db->Delete(write_options, key)); } HistogramImpl hist_get; @@ -91,7 +93,7 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { std::string value; get_perf_context()->Reset(); - StopWatchNano timer(Env::Default()); + StopWatchNano timer(SystemClock::Default().get()); timer.Start(); auto status = db->Get(read_options, key, &value); auto elapsed_nanos = timer.ElapsedNanos(); @@ -110,16 +112,15 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { std::unique_ptr iter(db->NewIterator(read_options)); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); iter->SeekToFirst(); hist_seek_to_first.Add(get_perf_context()->user_key_comparison_count); auto elapsed_nanos = timer.ElapsedNanos(); if (FLAGS_verbose) { - std::cout << "SeekToFirst uesr key comparison: \n" - << hist_seek_to_first.ToString() - << "ikey skipped: " << get_perf_context()->internal_key_skipped_count - << "\n" + std::cout << "SeekToFirst user key comparison: \n" + << hist_seek_to_first.ToString() << "ikey skipped: " + << get_perf_context()->internal_key_skipped_count << "\n" << "idelete skipped: " << get_perf_context()->internal_delete_skipped_count << "\n" << "elapsed: " << elapsed_nanos << "\n"; @@ -132,7 +133,7 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { std::string key = "k" + ToString(i); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); iter->Seek(key); auto elapsed_nanos = timer.ElapsedNanos(); hist_seek.Add(get_perf_context()->user_key_comparison_count); @@ -146,7 +147,7 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { get_perf_context()->Reset(); ASSERT_TRUE(iter->Valid()); - StopWatchNano timer2(Env::Default(), true); + StopWatchNano timer2(SystemClock::Default().get(), true); iter->Next(); auto elapsed_nanos2 = timer2.ElapsedNanos(); if (FLAGS_verbose) { @@ -156,7 +157,7 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { } if (FLAGS_verbose) { - std::cout << "Seek uesr key comparison: \n" << hist_seek.ToString(); + std::cout << "Seek user key comparison: \n" << hist_seek.ToString(); } } @@ -165,7 +166,7 @@ TEST_F(PerfContextTest, StopWatchNanoOverhead) { const int kTotalIterations = 1000000; std::vector timings(kTotalIterations); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); for (auto& timing : timings) { timing = timer.ElapsedNanos(true /* reset */); } @@ -186,7 +187,7 @@ TEST_F(PerfContextTest, StopWatchOverhead) { uint64_t elapsed = 0; std::vector timings(kTotalIterations); - StopWatch timer(Env::Default(), nullptr, 0, &elapsed); + StopWatch timer(SystemClock::Default().get(), nullptr, 0, &elapsed); for (auto& timing : timings) { timing = elapsed; } @@ -270,7 +271,7 @@ void ProfileQueries(bool 
enabled_time = false) { std::vector values; get_perf_context()->Reset(); - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); if (++num_mutex_waited > 3) { #ifndef NDEBUG ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U); @@ -314,7 +315,10 @@ void ProfileQueries(bool enabled_time = false) { hist_get.Add(get_perf_context()->user_key_comparison_count); get_perf_context()->Reset(); - db->MultiGet(read_options, multiget_keys, &values); + auto statuses = db->MultiGet(read_options, multiget_keys, &values); + for (const auto& s : statuses) { + ASSERT_OK(s); + } hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time); hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time); hist_mget_files.Add(get_perf_context()->get_from_output_files_time); @@ -324,9 +328,10 @@ void ProfileQueries(bool enabled_time = false) { } if (FLAGS_verbose) { - std::cout << "Put uesr key comparison: \n" << hist_put.ToString() - << "Get uesr key comparison: \n" << hist_get.ToString() - << "MultiGet uesr key comparison: \n" << hist_get.ToString(); + std::cout << "Put user key comparison: \n" + << hist_put.ToString() << "Get user key comparison: \n" + << hist_get.ToString() << "MultiGet user key comparison: \n" + << hist_get.ToString(); std::cout << "Put(): Pre and Post Process Time: \n" << hist_write_pre_post.ToString() << " Writing WAL time: \n" << hist_write_wal_time.ToString() << "\n" @@ -428,7 +433,10 @@ void ProfileQueries(bool enabled_time = false) { hist_get.Add(get_perf_context()->user_key_comparison_count); get_perf_context()->Reset(); - db->MultiGet(read_options, multiget_keys, &values); + auto statuses = db->MultiGet(read_options, multiget_keys, &values); + for (const auto& s : statuses) { + ASSERT_OK(s); + } hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time); hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time); hist_mget_files.Add(get_perf_context()->get_from_output_files_time); @@ -438,8 +446,9 @@ void ProfileQueries(bool enabled_time = false) { } if (FLAGS_verbose) { - std::cout << "ReadOnly Get uesr key comparison: \n" << hist_get.ToString() - << "ReadOnly MultiGet uesr key comparison: \n" + std::cout << "ReadOnly Get user key comparison: \n" + << hist_get.ToString() + << "ReadOnly MultiGet user key comparison: \n" << hist_mget.ToString(); std::cout << "ReadOnly Get(): Time to get snapshot: \n" @@ -532,14 +541,14 @@ TEST_F(PerfContextTest, SeekKeyComparison) { HistogramImpl hist_time_diff; SetPerfLevel(kEnableTime); - StopWatchNano timer(Env::Default()); + StopWatchNano timer(SystemClock::Default().get()); for (const int i : keys) { std::string key = "k" + ToString(i); std::string value = "v" + ToString(i); get_perf_context()->Reset(); timer.Start(); - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); auto put_time = timer.ElapsedNanos(); hist_put_time.Add(put_time); hist_wal_time.Add(get_perf_context()->write_wal_time); @@ -573,7 +582,7 @@ TEST_F(PerfContextTest, SeekKeyComparison) { iter->Next(); hist_next.Add(get_perf_context()->user_key_comparison_count); } - + ASSERT_OK(iter->status()); if (FLAGS_verbose) { std::cout << "Seek:\n" << hist_seek.ToString() << "Next:\n" << hist_next.ToString(); @@ -585,25 +594,26 @@ TEST_F(PerfContextTest, DBMutexLockCounter) { for (PerfLevel perf_level_test : {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) { for (int c = 0; c < 2; ++c) { - InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]); - mutex.Lock(); - 
ROCKSDB_NAMESPACE::port::Thread child_thread([&] { - SetPerfLevel(perf_level_test); - get_perf_context()->Reset(); - ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); + InstrumentedMutex mutex(nullptr, SystemClock::Default().get(), + stats_code[c]); mutex.Lock(); - mutex.Unlock(); - if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex || - stats_code[c] != DB_MUTEX_WAIT_MICROS) { + ROCKSDB_NAMESPACE::port::Thread child_thread([&] { + SetPerfLevel(perf_level_test); + get_perf_context()->Reset(); ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); - } else { - // increment the counter only when it's a DB Mutex - ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0); - } - }); - Env::Default()->SleepForMicroseconds(100); - mutex.Unlock(); - child_thread.join(); + mutex.Lock(); + mutex.Unlock(); + if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex || + stats_code[c] != DB_MUTEX_WAIT_MICROS) { + ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); + } else { + // increment the counter only when it's a DB Mutex + ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0); + } + }); + SystemClock::Default()->SleepForMicroseconds(100); + mutex.Unlock(); + child_thread.join(); } } } @@ -612,7 +622,8 @@ TEST_F(PerfContextTest, FalseDBMutexWait) { SetPerfLevel(kEnableTime); int stats_code[] = {0, static_cast(DB_MUTEX_WAIT_MICROS)}; for (int c = 0; c < 2; ++c) { - InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]); + InstrumentedMutex mutex(nullptr, SystemClock::Default().get(), + stats_code[c]); InstrumentedCondVar lock(&mutex); get_perf_context()->Reset(); mutex.Lock(); @@ -817,10 +828,8 @@ TEST_F(PerfContextTest, PerfContextByLevelGetSet) { } TEST_F(PerfContextTest, CPUTimer) { - if (Env::Default()->NowCPUNanos() == 0) { - // TODO: This should be a GTEST_SKIP when the embedded gtest is updated - // to 1.10 or higher. - GTEST_SUCCESS_("Skipped on target without NowCPUNanos support"); + if (SystemClock::Default()->CPUNanos() == 0) { + ROCKSDB_GTEST_SKIP("Target without CPUNanos support"); return; } @@ -837,7 +846,7 @@ TEST_F(PerfContextTest, CPUTimer) { std::string value = "v" + i_str; max_str = max_str > i_str ? max_str : i_str; - db->Put(write_options, key, value); + ASSERT_OK(db->Put(write_options, key, value)); } std::string last_key = "k" + max_str; std::string last_value = "v" + max_str; diff --git a/db/periodic_work_scheduler.cc b/db/periodic_work_scheduler.cc index 121439011ed..677eec90cdd 100644 --- a/db/periodic_work_scheduler.cc +++ b/db/periodic_work_scheduler.cc @@ -6,13 +6,14 @@ #include "db/periodic_work_scheduler.h" #include "db/db_impl/db_impl.h" -#include "util/cast_util.h" +#include "rocksdb/system_clock.h" #ifndef ROCKSDB_LITE namespace ROCKSDB_NAMESPACE { -PeriodicWorkScheduler::PeriodicWorkScheduler(Env* env) : timer_mu_(env) { - timer = std::unique_ptr(new Timer(env)); +PeriodicWorkScheduler::PeriodicWorkScheduler( + const std::shared_ptr& clock) { + timer = std::unique_ptr(new Timer(clock.get())); } void PeriodicWorkScheduler::Register(DBImpl* dbi, @@ -53,10 +54,10 @@ void PeriodicWorkScheduler::Unregister(DBImpl* dbi) { } PeriodicWorkScheduler* PeriodicWorkScheduler::Default() { - // Always use the default Env for the scheduler, as we only use the NowMicros - // which is the same for all env. - // The Env could only be overridden in test. - static PeriodicWorkScheduler scheduler(Env::Default()); + // Always use the default SystemClock for the scheduler, as we only use the + // NowMicros which is the same for all clocks. 
The Env could only be + // overridden in test. + static PeriodicWorkScheduler scheduler(SystemClock::Default()); return &scheduler; } @@ -70,12 +71,13 @@ std::string PeriodicWorkScheduler::GetTaskName(DBImpl* dbi, #ifndef NDEBUG -// Get the static scheduler. For a new env, it needs to re-create the internal -// timer, so only re-create it when there's no running task. Otherwise, return -// the existing scheduler. Which means if the unittest needs to update MockEnv, -// Close all db instances and then re-open them. -PeriodicWorkTestScheduler* PeriodicWorkTestScheduler::Default(Env* env) { - static PeriodicWorkTestScheduler scheduler(env); +// Get the static scheduler. For a new SystemClock, it needs to re-create the +// internal timer, so only re-create it when there's no running task. Otherwise, +// return the existing scheduler. Which means if the unittest needs to update +// MockClock, Close all db instances and then re-open them. +PeriodicWorkTestScheduler* PeriodicWorkTestScheduler::Default( + const std::shared_ptr& clock) { + static PeriodicWorkTestScheduler scheduler(clock); static port::Mutex mutex; { MutexLock l(&mutex); @@ -85,7 +87,7 @@ PeriodicWorkTestScheduler* PeriodicWorkTestScheduler::Default(Env* env) { MutexLock timer_mu_guard(&scheduler.timer_mu_); scheduler.timer->Shutdown(); } - scheduler.timer.reset(new Timer(env)); + scheduler.timer.reset(new Timer(clock.get())); } } return &scheduler; @@ -105,8 +107,9 @@ size_t PeriodicWorkTestScheduler::TEST_GetValidTaskNum() const { return 0; } -PeriodicWorkTestScheduler::PeriodicWorkTestScheduler(Env* env) - : PeriodicWorkScheduler(env) {} +PeriodicWorkTestScheduler::PeriodicWorkTestScheduler( + const std::shared_ptr& clock) + : PeriodicWorkScheduler(clock) {} #endif // !NDEBUG } // namespace ROCKSDB_NAMESPACE diff --git a/db/periodic_work_scheduler.h b/db/periodic_work_scheduler.h index 9382adc449b..fe89ff567f7 100644 --- a/db/periodic_work_scheduler.h +++ b/db/periodic_work_scheduler.h @@ -11,6 +11,7 @@ #include "util/timer.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; // PeriodicWorkScheduler is a singleton object, which is scheduling/running // DumpStats(), PersistStats(), and FlushInfoLog() for all DB instances. All DB @@ -49,25 +50,26 @@ class PeriodicWorkScheduler { // the `Timer::Cancel()`s and `Timer::Shutdown()` run atomically. port::Mutex timer_mu_; - explicit PeriodicWorkScheduler(Env* env); + explicit PeriodicWorkScheduler(const std::shared_ptr& clock); private: std::string GetTaskName(DBImpl* dbi, const std::string& func_name); }; #ifndef NDEBUG -// PeriodicWorkTestScheduler is for unittest, which can specify the Env like -// SafeMockTimeEnv. It also contains functions for unittest. +// PeriodicWorkTestScheduler is for unittest, which can specify the SystemClock +// It also contains functions for unittest. 
class PeriodicWorkTestScheduler : public PeriodicWorkScheduler { public: - static PeriodicWorkTestScheduler* Default(Env* env); + static PeriodicWorkTestScheduler* Default( + const std::shared_ptr& clock); void TEST_WaitForRun(std::function callback) const; size_t TEST_GetValidTaskNum() const; private: - explicit PeriodicWorkTestScheduler(Env* env); + explicit PeriodicWorkTestScheduler(const std::shared_ptr& clock); }; #endif // !NDEBUG diff --git a/db/periodic_work_scheduler_test.cc b/db/periodic_work_scheduler_test.cc index d53265389dd..a92b8730ff8 100644 --- a/db/periodic_work_scheduler_test.cc +++ b/db/periodic_work_scheduler_test.cc @@ -6,6 +6,8 @@ #include "db/periodic_work_scheduler.h" #include "db/db_test_util.h" +#include "env/composite_env_wrapper.h" +#include "test_util/mock_time_env.h" namespace ROCKSDB_NAMESPACE { @@ -13,20 +15,23 @@ namespace ROCKSDB_NAMESPACE { class PeriodicWorkSchedulerTest : public DBTestBase { public: PeriodicWorkSchedulerTest() - : DBTestBase("/periodic_work_scheduler_test", /*env_do_fsync=*/true), - mock_env_(new MockTimeEnv(Env::Default())) {} + : DBTestBase("/periodic_work_scheduler_test", /*env_do_fsync=*/true) { + mock_clock_ = std::make_shared(env_->GetSystemClock()); + mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_)); + } protected: - std::unique_ptr mock_env_; + std::unique_ptr mock_env_; + std::shared_ptr mock_clock_; void SetUp() override { - mock_env_->InstallTimedWaitFixCallback(); + mock_clock_->InstallTimedWaitFixCallback(); SyncPoint::GetInstance()->SetCallBack( "DBImpl::StartPeriodicWorkScheduler:Init", [&](void* arg) { auto* periodic_work_scheduler_ptr = reinterpret_cast(arg); *periodic_work_scheduler_ptr = - PeriodicWorkTestScheduler::Default(mock_env_.get()); + PeriodicWorkTestScheduler::Default(mock_clock_); }); } }; @@ -62,7 +67,7 @@ TEST_F(PeriodicWorkSchedulerTest, Basic) { ASSERT_GT(kPeriodSec, 1u); dbfull()->TEST_WaitForStatsDumpRun([&] { - mock_env_->MockSleepForSeconds(static_cast(kPeriodSec) - 1); + mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec) - 1); }); auto scheduler = dbfull()->TEST_GetPeriodicWorkScheduler(); @@ -74,14 +79,14 @@ TEST_F(PeriodicWorkSchedulerTest, Basic) { ASSERT_EQ(1, flush_info_log_counter); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); ASSERT_EQ(2, dump_st_counter); ASSERT_EQ(2, pst_st_counter); ASSERT_EQ(2, flush_info_log_counter); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); ASSERT_EQ(3, dump_st_counter); ASSERT_EQ(3, pst_st_counter); @@ -95,7 +100,7 @@ TEST_F(PeriodicWorkSchedulerTest, Basic) { // Info log flush should still run. 
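The scheduler changes above replace the Env dependency with a SystemClock so that unit tests can drive time through a mock. The general shape of that injection, sketched with hypothetical Clock/MockClock/Scheduler classes (this is not the RocksDB API):

    #include <chrono>
    #include <cstdint>
    #include <memory>
    #include <utility>

    class Clock {
     public:
      virtual ~Clock() = default;
      virtual uint64_t NowMicros() = 0;
    };

    class RealClock : public Clock {
     public:
      uint64_t NowMicros() override {
        using namespace std::chrono;
        return static_cast<uint64_t>(
            duration_cast<microseconds>(steady_clock::now().time_since_epoch())
                .count());
      }
    };

    class MockClock : public Clock {
     public:
      uint64_t NowMicros() override { return now_micros_; }
      void MockSleepForSeconds(uint64_t s) { now_micros_ += s * 1000000; }
     private:
      uint64_t now_micros_ = 0;
    };

    class Scheduler {
     public:
      explicit Scheduler(std::shared_ptr<Clock> clock)
          : clock_(std::move(clock)) {}
      uint64_t Now() { return clock_->NowMicros(); }
     private:
      std::shared_ptr<Clock> clock_;
    };

A test hands the scheduler a MockClock and advances virtual time explicitly, which is what the MockSystemClock-based tests in this hunk do via MockSleepForSeconds.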
dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); ASSERT_EQ(3, dump_st_counter); ASSERT_EQ(3, pst_st_counter); ASSERT_EQ(4, flush_info_log_counter); @@ -113,7 +118,7 @@ TEST_F(PeriodicWorkSchedulerTest, Basic) { ASSERT_EQ(2, scheduler->TEST_GetValidTaskNum()); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(static_cast(kPeriodSec)); }); + [&] { mock_clock_->MockSleepForSeconds(static_cast(kPeriodSec)); }); ASSERT_EQ(4, dump_st_counter); ASSERT_EQ(3, pst_st_counter); ASSERT_EQ(5, flush_info_log_counter); @@ -153,19 +158,19 @@ TEST_F(PeriodicWorkSchedulerTest, MultiInstances) { int expected_run = kInstanceNum; dbi->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); ASSERT_EQ(expected_run, dump_st_counter); ASSERT_EQ(expected_run, pst_st_counter); expected_run += kInstanceNum; dbi->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_EQ(expected_run, dump_st_counter); ASSERT_EQ(expected_run, pst_st_counter); expected_run += kInstanceNum; dbi->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_EQ(expected_run, dump_st_counter); ASSERT_EQ(expected_run, pst_st_counter); @@ -177,14 +182,14 @@ TEST_F(PeriodicWorkSchedulerTest, MultiInstances) { expected_run += (kInstanceNum - half) * 2; dbi->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); dbi->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_EQ(expected_run, dump_st_counter); ASSERT_EQ(expected_run, pst_st_counter); for (int i = half; i < kInstanceNum; i++) { - dbs[i]->Close(); + ASSERT_OK(dbs[i]->Close()); delete dbs[i]; } } @@ -201,7 +206,8 @@ TEST_F(PeriodicWorkSchedulerTest, MultiEnv) { Reopen(options1); - std::unique_ptr mock_env2(new MockTimeEnv(Env::Default())); + std::unique_ptr mock_env2( + new CompositeEnvWrapper(Env::Default(), mock_clock_)); Options options2; options2.stats_dump_period_sec = kDumpPeriodSec; options2.stats_persist_period_sec = kPersistPeriodSec; @@ -216,7 +222,7 @@ TEST_F(PeriodicWorkSchedulerTest, MultiEnv) { ASSERT_EQ(dbi->TEST_GetPeriodicWorkScheduler(), dbfull()->TEST_GetPeriodicWorkScheduler()); - db->Close(); + ASSERT_OK(db->Close()); delete db; Close(); } diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index dd428da9bb2..7f5023127a7 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -16,7 +16,6 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" -#include "logging/logging.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -39,7 +38,6 @@ #include "util/string_util.h" #include "utilities/merge_operators.h" -using std::unique_ptr; namespace ROCKSDB_NAMESPACE { class PlainTableKeyDecoderTest : public testing::Test {}; @@ -51,9 +49,9 @@ TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) { Slice contents(tmp); test::StringSource* string_source = new test::StringSource(contents, 0, false); - + std::unique_ptr holder(string_source); std::unique_ptr file_reader( - 
test::GetRandomAccessFileReader(string_source)); + new RandomAccessFileReader(std::move(holder), "test")); std::unique_ptr file_info( new PlainTableReaderFileInfo(std::move(file_reader), EnvOptions(), kLength)); @@ -264,18 +262,15 @@ extern const uint64_t kPlainTableMagicNumber; class TestPlainTableReader : public PlainTableReader { public: - TestPlainTableReader(const EnvOptions& env_options, - const InternalKeyComparator& icomparator, - EncodingType encoding_type, uint64_t file_size, - int bloom_bits_per_key, double hash_table_ratio, - size_t index_sparseness, - const TableProperties* table_properties, - std::unique_ptr&& file, - const ImmutableCFOptions& ioptions, - const SliceTransform* prefix_extractor, - bool* expect_bloom_not_match, bool store_index_in_file, - uint32_t column_family_id, - const std::string& column_family_name) + TestPlainTableReader( + const EnvOptions& env_options, const InternalKeyComparator& icomparator, + EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, + const TableProperties* table_properties, + std::unique_ptr&& file, + const ImmutableOptions& ioptions, const SliceTransform* prefix_extractor, + bool* expect_bloom_not_match, bool store_index_in_file, + uint32_t column_family_id, const std::string& column_family_name) : PlainTableReader(ioptions, std::move(file), env_options, icomparator, encoding_type, file_size, table_properties, prefix_extractor), @@ -397,7 +392,7 @@ class TestPlainTableFactory : public PlainTableFactory { TEST_P(PlainTableDBTest, BadOptions1) { // Build with a prefix extractor ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); // Bad attempt to re-open without a prefix extractor Options options = CurrentOptions(); @@ -428,7 +423,9 @@ TEST_P(PlainTableDBTest, BadOptions2) { // Build without a prefix extractor // (apparently works even if hash_table_ratio > 0) ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + // Build without a prefix extractor, this call will fail and returns the + // status for this bad attempt. 
+ ASSERT_NOK(dbfull()->TEST_FlushMemTable()); // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor Status s = TryReopen(&options); @@ -503,14 +500,15 @@ TEST_P(PlainTableDBTest, Flush) { ASSERT_OK(Put("1000000000000foo", "v1")); ASSERT_OK(Put("0000000000000bar", "v2")); ASSERT_OK(Put("1000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_TRUE(dbfull()->GetIntProperty( "rocksdb.estimate-table-readers-mem", &int_num)); ASSERT_GT(int_num, 0U); TablePropertiesCollection ptc; - reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc); + ASSERT_OK( + reinterpret_cast(dbfull())->GetPropertiesOfAllTables(&ptc)); ASSERT_EQ(1U, ptc.size()); auto row = ptc.begin(); auto tp = row->second; @@ -595,23 +593,23 @@ TEST_P(PlainTableDBTest, Flush2) { DestroyAndReopen(&options); ASSERT_OK(Put("0000000000000bar", "b")); ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_OK(Put("1000000000000foo", "v2")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v2", Get("1000000000000foo")); ASSERT_OK(Put("0000000000000eee", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v3", Get("0000000000000eee")); ASSERT_OK(Delete("0000000000000bar")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); ASSERT_OK(Put("0000000000000eee", "v5")); ASSERT_OK(Put("9000000000000eee", "v5")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v5", Get("0000000000000eee")); // Test Bloom Filter @@ -651,7 +649,7 @@ TEST_P(PlainTableDBTest, Immortal) { DestroyAndReopen(&options); ASSERT_OK(Put("0000000000000bar", "b")); ASSERT_OK(Put("1000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); int copied = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -729,7 +727,7 @@ TEST_P(PlainTableDBTest, Iterator) { ASSERT_OK(Put("1000000000foo005", "v__5")); ASSERT_OK(Put("1000000000foo007", "v__7")); ASSERT_OK(Put("1000000000foo008", "v__8")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_EQ("v__3", Get("1000000000foo003")); Iterator* iter = dbfull()->NewIterator(ReadOptions()); @@ -799,7 +797,7 @@ TEST_P(PlainTableDBTest, Iterator) { expect_bloom_not_match = false; } } - + ASSERT_OK(iter->status()); delete iter; } } @@ -840,7 +838,7 @@ TEST_P(PlainTableDBTest, BloomSchema) { for (unsigned i = 0; i < 2345; ++i) { ASSERT_OK(Put(NthKey(i, 'y'), "added")); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("added", Get(NthKey(42, 'y'))); for (unsigned i = 0; i < 32; ++i) { @@ -898,7 +896,7 @@ TEST_P(PlainTableDBTest, IteratorLargeKeys) { ASSERT_OK(Put(key_list[i], ToString(i))); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); Iterator* iter = dbfull()->NewIterator(ReadOptions()); iter->Seek(key_list[0]); @@ -946,7 +944,7 @@ TEST_P(PlainTableDBTest, IteratorLargeKeysWithPrefix) { ASSERT_OK(Put(key_list[i], ToString(i))); } - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); Iterator* iter = dbfull()->NewIterator(ReadOptions()); iter->Seek(key_list[0]); @@ -981,7 +979,7 @@ TEST_P(PlainTableDBTest, IteratorReverseSuffixComparator) { 
ASSERT_OK(Put("1000000000foo005", "v__5")); ASSERT_OK(Put("1000000000foo007", "v__7")); ASSERT_OK(Put("1000000000foo008", "v__8")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("1000000000foo001")); ASSERT_EQ("v__3", Get("1000000000foo003")); Iterator* iter = dbfull()->NewIterator(ReadOptions()); @@ -1059,7 +1057,7 @@ TEST_P(PlainTableDBTest, HashBucketConflict) { ASSERT_OK(Put("2000000000000fo2", "v")); ASSERT_OK(Put("2000000000000fo3", "v")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("5000000000000fo0")); ASSERT_EQ("v2", Get("5000000000000fo1")); @@ -1120,6 +1118,7 @@ TEST_P(PlainTableDBTest, HashBucketConflict) { iter->Seek("8000000000000fo2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1153,7 +1152,7 @@ TEST_P(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) { ASSERT_OK(Put("2000000000000fo2", "v")); ASSERT_OK(Put("2000000000000fo3", "v")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("5000000000000fo0")); ASSERT_EQ("v2", Get("5000000000000fo1")); @@ -1213,6 +1212,7 @@ TEST_P(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) { iter->Seek("8000000000000fo2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -1235,7 +1235,7 @@ TEST_P(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) { ASSERT_OK(Put("5000000000000fo1", "v2")); ASSERT_OK(Put("5000000000000fo2", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v1", Get("5000000000000fo0")); ASSERT_EQ("v2", Get("5000000000000fo1")); @@ -1259,6 +1259,7 @@ TEST_P(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) { iter->Seek("8000000000000fo2"); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } @@ -1286,7 +1287,7 @@ TEST_P(PlainTableDBTest, CompactionTrigger) { ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Put(Key(999), "")); - dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); } @@ -1297,7 +1298,7 @@ TEST_P(PlainTableDBTest, CompactionTrigger) { ASSERT_OK(Put(Key(i), values[i])); } ASSERT_OK(Put(Key(999), "")); - dbfull()->TEST_WaitForCompact(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_EQ(NumTableFilesAtLevel(1), 1); @@ -1313,7 +1314,7 @@ TEST_P(PlainTableDBTest, AdaptiveTable) { ASSERT_OK(Put("1000000000000foo", "v1")); ASSERT_OK(Put("0000000000000bar", "v2")); ASSERT_OK(Put("1000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); options.create_if_missing = false; std::shared_ptr block_based_factory( @@ -1329,7 +1330,7 @@ TEST_P(PlainTableDBTest, AdaptiveTable) { ASSERT_OK(Put("2000000000000foo", "v4")); ASSERT_OK(Put("3000000000000bar", "v5")); - dbfull()->TEST_FlushMemTable(); + ASSERT_OK(dbfull()->TEST_FlushMemTable()); ASSERT_EQ("v4", Get("2000000000000foo")); ASSERT_EQ("v5", Get("3000000000000bar")); diff --git a/db/prefix_test.cc b/db/prefix_test.cc index d1ec6a2c8a3..37673eb8c8a 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -25,6 +25,7 @@ int main() { #include "rocksdb/memtablerep.h" #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" #include "test_util/testharness.h" #include "util/cast_util.h" @@ -311,7 +312,7 @@ TEST(SamePrefixTest, 
InDomainTest) { ASSERT_OK(db->Put(write_options, "HHKB pro2", "Mar 24, 2006")); ASSERT_OK(db->Put(write_options, "HHKB pro2 Type-S", "June 29, 2011")); ASSERT_OK(db->Put(write_options, "Realforce 87u", "idk")); - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); std::string result; auto db_iter = db->NewIterator(ReadOptions()); @@ -331,7 +332,7 @@ TEST(SamePrefixTest, InDomainTest) { ASSERT_OK(db->Put(write_options, "pikachu", "1")); ASSERT_OK(db->Put(write_options, "Meowth", "1")); ASSERT_OK(db->Put(write_options, "Mewtwo", "idk")); - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); std::string result; auto db_iter = db->NewIterator(ReadOptions()); @@ -351,7 +352,7 @@ TEST_F(PrefixTest, TestResult) { std::cout << "*** Mem table: " << options.memtable_factory->Name() << " number of buckets: " << num_buckets << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -528,7 +529,7 @@ TEST_F(PrefixTest, PrefixValid) { while (NextOptions(num_buckets)) { std::cout << "*** Mem table: " << options.memtable_factory->Name() << " number of buckets: " << num_buckets << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -543,7 +544,7 @@ TEST_F(PrefixTest, PrefixValid) { PutKey(db.get(), write_options, 12345, 8, v18); PutKey(db.get(), write_options, 12345, 9, v19); PutKey(db.get(), write_options, 12346, 8, v16); - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); TestKey test_key(12346, 8); std::string s; ASSERT_OK(db->Delete(write_options, TestKeyToSlice(s, test_key))); @@ -581,7 +582,7 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { while (NextOptions(FLAGS_bucket_count)) { std::cout << "*** Mem table: " << options.memtable_factory->Name() << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -597,7 +598,6 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { HistogramImpl hist_put_time; HistogramImpl hist_put_comparison; - // insert x random prefix, each with y continuous element. 
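The timing hunks just below replace Env::Default() with SystemClock::Default().get() in StopWatchNano. A minimal sketch of that pattern, assuming only the constructor and ElapsedNanos() signatures visible in this patch; the TimeOnce helper is illustrative, not part of the change:

```cpp
#include <cstdint>
#include <functional>

#include "rocksdb/system_clock.h"
#include "util/stop_watch.h"

namespace ROCKSDB_NAMESPACE {

// Times a single unit of work against the default SystemClock.
uint64_t TimeOnce(const std::function<void()>& work) {
  StopWatchNano timer(SystemClock::Default().get(), true /* auto_start */);
  work();
  return timer.ElapsedNanos();
}

}  // namespace ROCKSDB_NAMESPACE
```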
for (auto prefix : prefixes) { for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) { @@ -608,7 +608,7 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { std::string value(FLAGS_value_size, 0); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); ASSERT_OK(db->Put(write_options, key, value)); hist_put_time.Add(timer.ElapsedNanos()); hist_put_comparison.Add(get_perf_context()->user_key_comparison_count); @@ -631,7 +631,7 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { std::string value = "v" + ToString(0); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); auto key_prefix = options.prefix_extractor->Transform(key); uint64_t total_keys = 0; for (iter->Seek(key); @@ -665,7 +665,7 @@ TEST_F(PrefixTest, DynamicPrefixIterator) { Slice key = TestKeyToSlice(s, test_key); get_perf_context()->Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); iter->Seek(key); hist_no_seek_time.Add(timer.ElapsedNanos()); hist_no_seek_comparison.Add(get_perf_context()->user_key_comparison_count); @@ -689,7 +689,7 @@ TEST_F(PrefixTest, PrefixSeekModePrev) { for (size_t m = 1; m < 100; m++) { std::cout << "[" + std::to_string(m) + "]" + "*** Mem table: " << options.memtable_factory->Name() << std::endl; - DestroyDB(kDbName, Options()); + ASSERT_OK(DestroyDB(kDbName, Options())); auto db = OpenDb(); WriteOptions write_options; ReadOptions read_options; @@ -714,7 +714,7 @@ TEST_F(PrefixTest, PrefixSeekModePrev) { } } if (i < 2) { - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); } } diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index 20616c22e58..47599a18fa3 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -33,9 +33,10 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator( if (smallest != nullptr) { pinned_bounds_.emplace_back(); auto& parsed_smallest = pinned_bounds_.back(); - Status pikStatus = ParseInternalKey(smallest->Encode(), &parsed_smallest); - pikStatus.PermitUncheckedError(); - assert(pikStatus.ok()); + Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest, + false /* log_err_key */); // TODO + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); smallest_ = &parsed_smallest; } @@ -43,9 +44,10 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator( pinned_bounds_.emplace_back(); auto& parsed_largest = pinned_bounds_.back(); - Status pikStatus = ParseInternalKey(largest->Encode(), &parsed_largest); - pikStatus.PermitUncheckedError(); - assert(pikStatus.ok()); + Status pik_status = ParseInternalKey(largest->Encode(), &parsed_largest, + false /* log_err_key */); // TODO + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); if (parsed_largest.type == kTypeRangeDeletion && parsed_largest.sequence == kMaxSequenceNumber) { diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index d5d79d5a81a..8bbee50fb9d 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -43,12 +43,12 @@ class TruncatedRangeDelIterator { void InternalNext(); - // Seeks to the tombstone with the highest viisble sequence number that covers + // Seeks to the tombstone with the highest visible sequence number that covers // target (a user key). If no such tombstone exists, the position will be at // the earliest tombstone that ends after target. 
void Seek(const Slice& target); - // Seeks to the tombstone with the highest viisble sequence number that covers + // Seeks to the tombstone with the highest visible sequence number that covers // target (a user key). If no such tombstone exists, the position will be at // the latest tombstone that starts before target. void SeekForPrev(const Slice& target); @@ -284,9 +284,10 @@ class RangeDelAggregator { bool ShouldDelete(const Slice& key, RangeDelPositioningMode mode) { ParsedInternalKey parsed; - Status pikStatus = ParseInternalKey(key, &parsed); - assert(pikStatus.ok()); - if (!pikStatus.ok()) { + Status pik_status = + ParseInternalKey(key, &parsed, false /* log_err_key */); // TODO + assert(pik_status.ok()); + if (!pik_status.ok()) { return false; } diff --git a/db/range_del_aggregator_bench.cc b/db/range_del_aggregator_bench.cc index 3f3135f2e83..061232f9926 100644 --- a/db/range_del_aggregator_bench.cc +++ b/db/range_del_aggregator_bench.cc @@ -11,8 +11,8 @@ int main() { } #else -#include #include +#include #include #include #include @@ -22,14 +22,13 @@ int main() { #include "db/range_del_aggregator.h" #include "db/range_tombstone_fragmenter.h" #include "rocksdb/comparator.h" -#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" #include "test_util/testutil.h" #include "util/coding.h" +#include "util/gflags_compat.h" #include "util/random.h" #include "util/stop_watch.h" -#include "util/gflags_compat.h" - using GFLAGS_NAMESPACE::ParseCommandLineFlags; DEFINE_int32(num_range_tombstones, 1000, "number of range tombstones created"); @@ -172,6 +171,8 @@ int main(int argc, char** argv) { ParseCommandLineFlags(&argc, &argv, true); Stats stats; + ROCKSDB_NAMESPACE::SystemClock* clock = + ROCKSDB_NAMESPACE::SystemClock::Default().get(); ROCKSDB_NAMESPACE::Random64 rnd(FLAGS_seed); std::default_random_engine random_gen(FLAGS_seed); std::normal_distribution normal_dist(FLAGS_tombstone_width_mean, @@ -220,7 +221,7 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::kMaxSequenceNumber)); ROCKSDB_NAMESPACE::StopWatchNano stop_watch_add_tombstones( - ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */); + clock, true /* auto_start */); range_del_agg.AddTombstones(std::move(fragmented_range_del_iter)); stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos(); } @@ -237,7 +238,7 @@ int main(int argc, char** argv) { parsed_key.user_key = key_string; ROCKSDB_NAMESPACE::StopWatchNano stop_watch_should_delete( - ROCKSDB_NAMESPACE::Env::Default(), true /* auto_start */); + clock, true /* auto_start */); range_del_agg.ShouldDelete(parsed_key, mode); uint64_t call_time = stop_watch_should_delete.ElapsedNanos(); diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc index 58426248c3e..0985fee0be3 100644 --- a/db/range_tombstone_fragmenter.cc +++ b/db/range_tombstone_fragmenter.cc @@ -6,12 +6,11 @@ #include "db/range_tombstone_fragmenter.h" #include +#include +#include #include #include -#include -#include - #include "util/autovector.h" #include "util/kv_map.h" #include "util/vector_iterator.h" @@ -26,12 +25,15 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList( return; } bool is_sorted = true; - int num_tombstones = 0; InternalKey pinned_last_start_key; Slice last_start_key; + num_unfragmented_tombstones_ = 0; + total_tombstone_payload_bytes_ = 0; for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); - unfragmented_tombstones->Next(), num_tombstones++) { - if (num_tombstones > 0 && + unfragmented_tombstones->Next(), 
num_unfragmented_tombstones_++) { + total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() + + unfragmented_tombstones->value().size(); + if (num_unfragmented_tombstones_ > 0 && icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) { is_sorted = false; break; @@ -51,10 +53,14 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList( // Sort the tombstones before fragmenting them. std::vector keys, values; - keys.reserve(num_tombstones); - values.reserve(num_tombstones); + keys.reserve(num_unfragmented_tombstones_); + values.reserve(num_unfragmented_tombstones_); + // Reset the counter to zero for the next iteration over keys. + total_tombstone_payload_bytes_ = 0; for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid(); unfragmented_tombstones->Next()) { + total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() + + unfragmented_tombstones->value().size(); keys.emplace_back(unfragmented_tombstones->key().data(), unfragmented_tombstones->key().size()); values.emplace_back(unfragmented_tombstones->value().data(), diff --git a/db/range_tombstone_fragmenter.h b/db/range_tombstone_fragmenter.h index 63ec24e64f0..f323db5d753 100644 --- a/db/range_tombstone_fragmenter.h +++ b/db/range_tombstone_fragmenter.h @@ -68,6 +68,14 @@ struct FragmentedRangeTombstoneList { // number in [lower, upper]. bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const; + uint64_t num_unfragmented_tombstones() const { + return num_unfragmented_tombstones_; + } + + uint64_t total_tombstone_payload_bytes() const { + return total_tombstone_payload_bytes_; + } + private: // Given an ordered range tombstone iterator unfragmented_tombstones, // "fragment" the tombstones into non-overlapping pieces, and store them in @@ -82,6 +90,8 @@ struct FragmentedRangeTombstoneList { std::set seq_set_; std::list pinned_slices_; PinnedIteratorsManager pinned_iters_mgr_; + uint64_t num_unfragmented_tombstones_; + uint64_t total_tombstone_payload_bytes_; }; // FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del @@ -180,6 +190,13 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { SequenceNumber upper_bound() const { return upper_bound_; } SequenceNumber lower_bound() const { return lower_bound_; } + uint64_t num_unfragmented_tombstones() const { + return tombstones_->num_unfragmented_tombstones(); + } + uint64_t total_tombstone_payload_bytes() const { + return tombstones_->total_tombstone_payload_bytes(); + } + private: using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack; diff --git a/db/repair.cc b/db/repair.cc index 671c105a1dc..1ebd47402bd 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -71,7 +71,6 @@ #include "db/table_cache.h" #include "db/version_edit.h" #include "db/write_batch_internal.h" -#include "env/composite_env_wrapper.h" #include "file/filename.h" #include "file/writable_file_writer.h" #include "options/cf_options.h" @@ -101,8 +100,8 @@ class Repairer { icmp_(default_cf_opts.comparator), default_cf_opts_( SanitizeOptions(immutable_db_options_, default_cf_opts)), - default_cf_iopts_( - ImmutableCFOptions(immutable_db_options_, default_cf_opts_)), + default_iopts_( + ImmutableOptions(immutable_db_options_, default_cf_opts_)), unknown_cf_opts_( SanitizeOptions(immutable_db_options_, unknown_cf_opts)), create_unknown_cfs_(create_unknown_cfs), @@ -110,14 +109,20 @@ class Repairer { // TableCache can be small since we expect each table to be opened // once. 
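The fragmenter above now counts unfragmented tombstones and their payload bytes and exposes them through FragmentedRangeTombstoneIterator. A small sketch of reading those counters, assuming only the two accessors added here; GetTombstoneStats is illustrative:

```cpp
#include <cstdint>

#include "db/range_tombstone_fragmenter.h"

namespace ROCKSDB_NAMESPACE {

// Reads back the pre-fragmentation tombstone count and payload size that the
// list records while it is being built.
void GetTombstoneStats(const FragmentedRangeTombstoneIterator& iter,
                       uint64_t* num_tombstones, uint64_t* payload_bytes) {
  *num_tombstones = iter.num_unfragmented_tombstones();
  *payload_bytes = iter.total_tombstone_payload_bytes();
}

}  // namespace ROCKSDB_NAMESPACE
```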
NewLRUCache(10, db_options_.table_cache_numshardbits)), - table_cache_(new TableCache( - default_cf_iopts_, env_options_, raw_table_cache_.get(), - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)), + table_cache_( + // TODO: db_session_id for TableCache should be initialized after + // db_session_id_ is set. + new TableCache(default_iopts_, env_options_, raw_table_cache_.get(), + /*block_cache_tracer=*/nullptr, + /*io_tracer=*/nullptr, /*db_session_id*/ "")), wb_(db_options_.db_write_buffer_size), wc_(db_options_.delayed_write_rate), + // TODO: db_session_id for VersionSet should be initialized after + // db_session_id_ is set and use it for initialization. vset_(dbname_, &immutable_db_options_, env_options_, raw_table_cache_.get(), &wb_, &wc_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr), + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ ""), next_file_number_(1), db_lock_(nullptr), closed_(false) { @@ -249,7 +254,7 @@ class Repairer { const ImmutableDBOptions immutable_db_options_; const InternalKeyComparator icmp_; const ColumnFamilyOptions default_cf_opts_; - const ImmutableCFOptions default_cf_iopts_; // table_cache_ holds reference + const ImmutableOptions default_iopts_; // table_cache_ holds reference const ColumnFamilyOptions unknown_cf_opts_; const bool create_unknown_cfs_; std::shared_ptr raw_table_cache_; @@ -312,7 +317,7 @@ class Repairer { if (number + 1 > next_file_number_) { next_file_number_ = number + 1; } - if (type == kLogFile) { + if (type == kWalFile) { logs_.push_back(number); } else if (type == kTableFile) { table_fds_.emplace_back(number, static_cast(path_id), @@ -358,14 +363,14 @@ class Repairer { // Open the log file std::string logname = LogFileName(db_options_.wal_dir, log); - std::unique_ptr lfile; - Status status = env_->NewSequentialFile( - logname, &lfile, env_->OptimizeForLogRead(env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr lfile_reader; + Status status = SequentialFileReader::Create( + fs, logname, fs->OptimizeForLogRead(env_options_), &lfile_reader, + nullptr); if (!status.ok()) { return status; } - std::unique_ptr lfile_reader(new SequentialFileReader( - NewLegacySequentialFileWrapper(lfile), logname)); // Create the log reader. 
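Opening the WAL now goes through the FileSystem API and SequentialFileReader::Create instead of the removed LegacySequentialFileWrapper. A hedged sketch of that call shape; OpenWalReader is an illustrative helper and the exact Create() signature is assumed from this patch:

```cpp
#include <memory>
#include <string>

#include "file/sequence_file_reader.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"

namespace ROCKSDB_NAMESPACE {

// Opens a WAL for sequential reading through the FileSystem owned by env.
Status OpenWalReader(Env* env, const std::string& wal_name,
                     const FileOptions& base_opts,
                     std::unique_ptr<SequentialFileReader>* reader) {
  const auto& fs = env->GetFileSystem();
  // OptimizeForLogRead tunes the FileOptions for a one-pass WAL scan.
  return SequentialFileReader::Create(
      fs, wal_name, fs->OptimizeForLogRead(base_opts), reader,
      /*dbg=*/nullptr);
}

}  // namespace ROCKSDB_NAMESPACE
```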
LogReporter reporter; @@ -426,7 +431,8 @@ class Repairer { Arena arena; ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); int64_t _current_time = 0; - status = env_->GetCurrentTime(&_current_time); // ignore error + immutable_db_options_.clock->GetCurrentTime(&_current_time) + .PermitUncheckedError(); // ignore error const uint64_t current_time = static_cast(_current_time); SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance(); @@ -439,22 +445,25 @@ class Repairer { range_del_iters.emplace_back(range_del_iter); } - LegacyFileSystemWrapper fs(env_); IOStatus io_s; + CompressionOptions default_compression; + TableBuilderOptions tboptions( + *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), + cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), + kNoCompression, default_compression, cfd->GetID(), cfd->GetName(), + -1 /* level */, false /* is_bottommost */, + TableFileCreationReason::kRecovery, current_time, + 0 /* oldest_key_time */, 0 /* file_creation_time */, + "DB Repairer" /* db_id */, db_session_id_, 0 /*target_file_size*/, + meta.fd.GetNumber()); status = BuildTable( - dbname_, /* versions */ nullptr, env_, &fs, *cfd->ioptions(), - *cfd->GetLatestMutableCFOptions(), env_options_, table_cache_.get(), - iter.get(), std::move(range_del_iters), &meta, - nullptr /* blob_file_additions */, cfd->internal_comparator(), - cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), - {}, kMaxSequenceNumber, snapshot_checker, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), false, - nullptr /* internal_stats */, TableFileCreationReason::kRecovery, - &io_s, nullptr /*IOTracer*/, nullptr /* event_logger */, - 0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */, - -1 /* level */, current_time, 0 /* oldest_key_time */, write_hint, - 0 /* file_creation_time */, "DB Repairer" /* db_id */, - db_session_id_); + dbname_, /* versions */ nullptr, immutable_db_options_, tboptions, + env_options_, table_cache_.get(), iter.get(), + std::move(range_del_iters), &meta, nullptr /* blob_file_additions */, + {}, kMaxSequenceNumber, snapshot_checker, + false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s, + nullptr /*IOTracer*/, nullptr /* event_logger */, 0 /* job_id */, + Env::IO_HIGH, nullptr /* table_properties */, write_hint); ROCKS_LOG_INFO(db_options_.info_log, "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, counter, meta.fd.GetNumber(), @@ -554,10 +563,12 @@ class Repairer { ParsedInternalKey parsed; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); - if (ParseInternalKey(key, &parsed) != Status::OK()) { + Status pik_status = + ParseInternalKey(key, &parsed, db_options_.allow_data_in_errors); + if (!pik_status.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, - "Table #%" PRIu64 ": unparsable key %s", - t->meta.fd.GetNumber(), EscapeString(key).c_str()); + "Table #%" PRIu64 ": unparsable key - %s", + t->meta.fd.GetNumber(), pik_status.getState()); continue; } diff --git a/db/repair_test.cc b/db/repair_test.cc index 49a96bf8aed..9ea2d9460ec 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -80,8 +80,8 @@ TEST_F(RepairTest, CorruptManifest) { Close(); ASSERT_OK(env_->FileExists(manifest_path)); - LegacyFileSystemWrapper fs(env_); - ASSERT_OK(CreateFile(&fs, manifest_path, "blah", false /* use_fsync */)); + ASSERT_OK(CreateFile(env_->GetFileSystem(), manifest_path, "blah", + false /* use_fsync */)); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); 
Reopen(CurrentOptions()); @@ -163,8 +163,8 @@ TEST_F(RepairTest, CorruptSst) { ASSERT_OK(GetFirstSstPath(&sst_path)); ASSERT_FALSE(sst_path.empty()); - LegacyFileSystemWrapper fs(env_); - ASSERT_OK(CreateFile(&fs, sst_path, "blah", false /* use_fsync */)); + ASSERT_OK(CreateFile(env_->GetFileSystem(), sst_path, "blah", + false /* use_fsync */)); Close(); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); @@ -184,7 +184,7 @@ TEST_F(RepairTest, UnflushedSst) { { uint64_t total_ssts_size; std::unordered_map sst_files; - ASSERT_OK(GetAllSSTFiles(&sst_files, &total_ssts_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); ASSERT_EQ(total_ssts_size, 0); } // Need to get path before Close() deletes db_, but delete it after Close() to @@ -203,7 +203,7 @@ TEST_F(RepairTest, UnflushedSst) { { uint64_t total_ssts_size; std::unordered_map sst_files; - ASSERT_OK(GetAllSSTFiles(&sst_files, &total_ssts_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); ASSERT_GT(total_ssts_size, 0); } ASSERT_EQ(Get("key"), "val"); @@ -221,7 +221,7 @@ TEST_F(RepairTest, SeparateWalDir) { { uint64_t total_ssts_size; std::unordered_map sst_files; - ASSERT_OK(GetAllSSTFiles(&sst_files, &total_ssts_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); ASSERT_EQ(total_ssts_size, 0); } std::string manifest_path = @@ -241,7 +241,7 @@ TEST_F(RepairTest, SeparateWalDir) { { uint64_t total_ssts_size; std::unordered_map sst_files; - ASSERT_OK(GetAllSSTFiles(&sst_files, &total_ssts_size)); + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); ASSERT_GT(total_ssts_size, 0); } ASSERT_EQ(Get("key"), "val"); diff --git a/db/snapshot_impl.h b/db/snapshot_impl.h index 785f814f81d..bfa44e3f53b 100644 --- a/db/snapshot_impl.h +++ b/db/snapshot_impl.h @@ -23,7 +23,7 @@ class SnapshotImpl : public Snapshot { SequenceNumber number_; // const after creation // It indicates the smallest uncommitted data at the time the snapshot was // taken. This is currently used by WritePrepared transactions to limit the - // scope of queries to IsInSnpashot. + // scope of queries to IsInSnapshot. SequenceNumber min_uncommitted_ = kMinUnCommittedSeq; virtual SequenceNumber GetSequenceNumber() const override { return number_; } diff --git a/db/table_cache.cc b/db/table_cache.cc index c6beee18237..2e4d2a58ae4 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -65,17 +65,19 @@ void AppendVarint64(IterKey* key, uint64_t v) { const int kLoadConcurency = 128; -TableCache::TableCache(const ImmutableCFOptions& ioptions, +TableCache::TableCache(const ImmutableOptions& ioptions, const FileOptions& file_options, Cache* const cache, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : ioptions_(ioptions), file_options_(file_options), cache_(cache), immortal_tables_(false), block_cache_tracer_(block_cache_tracer), - loader_mutex_(kLoadConcurency, GetSliceNPHash64), - io_tracer_(io_tracer) { + loader_mutex_(kLoadConcurency, kGetSliceNPHash64UnseededFnPtr), + io_tracer_(io_tracer), + db_session_id_(db_session_id) { if (ioptions_.row_cache) { // If the same cache is shared by multiple instances, we need to // disambiguate its entries. 
@@ -106,38 +108,38 @@ Status TableCache::GetTableReader( TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId()); std::unique_ptr file; FileOptions fopts = file_options; - Status s = PrepareIOFromReadOptions(ro, ioptions_.env, fopts.io_options); + Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); if (s.ok()) { s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr); } - RecordTick(ioptions_.statistics, NO_FILE_OPENS); + RecordTick(ioptions_.stats, NO_FILE_OPENS); if (s.IsPathNotFound()) { fname = Rocks2LevelTableFileName(fname); - s = PrepareIOFromReadOptions(ro, ioptions_.env, fopts.io_options); + s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); if (s.ok()) { s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, nullptr); } - RecordTick(ioptions_.statistics, NO_FILE_OPENS); + RecordTick(ioptions_.stats, NO_FILE_OPENS); } if (s.ok()) { if (!sequential_mode && ioptions_.advise_random_on_open) { file->Hint(FSRandomAccessFile::kRandom); } - StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS); + StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS); std::unique_ptr file_reader( new RandomAccessFileReader( - std::move(file), fname, ioptions_.env, io_tracer_, - record_read_stats ? ioptions_.statistics : nullptr, SST_READ_MICROS, - file_read_hist, ioptions_.rate_limiter, ioptions_.listeners)); + std::move(file), fname, ioptions_.clock, io_tracer_, + record_read_stats ? ioptions_.stats : nullptr, SST_READ_MICROS, + file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners)); s = ioptions_.table_factory->NewTableReader( ro, - TableReaderOptions(ioptions_, prefix_extractor, file_options, - internal_comparator, skip_filters, immortal_tables_, - false /* force_direct_prefetch */, level, - fd.largest_seqno, block_cache_tracer_, - max_file_size_for_l0_meta_pin), + TableReaderOptions( + ioptions_, prefix_extractor, file_options, internal_comparator, + skip_filters, immortal_tables_, false /* force_direct_prefetch */, + level, fd.largest_seqno, block_cache_tracer_, + max_file_size_for_l0_meta_pin, db_session_id_, fd.GetNumber()), std::move(file_reader), fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); @@ -161,7 +163,7 @@ Status TableCache::FindTable(const ReadOptions& ro, HistogramImpl* file_read_hist, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, size_t max_file_size_for_l0_meta_pin) { - PERF_TIMER_GUARD_WITH_ENV(find_table_nanos, ioptions_.env); + PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); uint64_t number = fd.GetNumber(); Slice key = GetSliceForFileNumber(&number); *handle = cache_->Lookup(key); @@ -187,7 +189,7 @@ Status TableCache::FindTable(const ReadOptions& ro, max_file_size_for_l0_meta_pin); if (!s.ok()) { assert(table_reader == nullptr); - RecordTick(ioptions_.statistics, NO_FILE_ERRORS); + RecordTick(ioptions_.stats, NO_FILE_ERRORS); // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. 
} else { @@ -375,10 +377,10 @@ bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, ioptions_.row_cache.get(), row_handle); replayGetContextLog(*found_row_cache_entry, user_key, get_context, &value_pinner); - RecordTick(ioptions_.statistics, ROW_CACHE_HIT); + RecordTick(ioptions_.stats, ROW_CACHE_HIT); found = true; } else { - RecordTick(ioptions_.statistics, ROW_CACHE_MISS); + RecordTick(ioptions_.stats, ROW_CACHE_MISS); } return found; } @@ -502,8 +504,8 @@ Status TableCache::MultiGet(const ReadOptions& options, for (auto miter = table_range.begin(); miter != table_range.end(); ++miter) { - const Slice& user_key = miter->ukey; - ; + const Slice& user_key = miter->ukey_with_ts; + GetContext* get_context = miter->get_context; if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size, @@ -539,9 +541,9 @@ Status TableCache::MultiGet(const ReadOptions& options, ++iter) { SequenceNumber* max_covering_tombstone_seq = iter->get_context->max_covering_tombstone_seq(); - *max_covering_tombstone_seq = - std::max(*max_covering_tombstone_seq, - range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey)); + *max_covering_tombstone_seq = std::max( + *max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts)); } } } @@ -566,7 +568,7 @@ Status TableCache::MultiGet(const ReadOptions& options, for (auto miter = table_range.begin(); miter != table_range.end(); ++miter) { std::string& row_cache_entry = row_cache_entries[row_idx++]; - const Slice& user_key = miter->ukey; + const Slice& user_key = miter->ukey_with_ts; ; GetContext* get_context = miter->get_context; diff --git a/db/table_cache.h b/db/table_cache.h index a834683fc17..0c263afe56e 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -10,9 +10,9 @@ // Thread-safe (provides internal synchronization) #pragma once +#include #include #include -#include #include "db/dbformat.h" #include "db/range_del_aggregator.h" @@ -48,10 +48,11 @@ class HistogramImpl; // ioptions.row_cache class TableCache { public: - TableCache(const ImmutableCFOptions& ioptions, + TableCache(const ImmutableOptions& ioptions, const FileOptions& storage_options, Cache* cache, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer); + const std::shared_ptr& io_tracer, + const std::string& db_session_id); ~TableCache(); // Return an iterator for the specified file number (the corresponding @@ -183,7 +184,7 @@ class TableCache { Cache* get_cache() const { return cache_; } - // Capacity of the backing Cache that indicates inifinite TableCache capacity. + // Capacity of the backing Cache that indicates infinite TableCache capacity. // For example when max_open_files is -1 we set the backing Cache to this. 
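The kInfiniteCapacity constant documented above stands in for an unlimited table cache. An assumed sketch of how a caller might size the backing LRU cache when max_open_files is -1; MakeTableCacheBacking is illustrative and not from the patch:

```cpp
#include <memory>

#include "db/table_cache.h"
#include "rocksdb/cache.h"

namespace ROCKSDB_NAMESPACE {

// Sizes the LRU cache that backs the TableCache; -1 open files maps to the
// "infinite" sentinel capacity described above.
std::shared_ptr<Cache> MakeTableCacheBacking(int max_open_files,
                                             int num_shard_bits) {
  const size_t capacity =
      (max_open_files == -1)
          ? static_cast<size_t>(TableCache::kInfiniteCapacity)
          : static_cast<size_t>(max_open_files);
  return NewLRUCache(capacity, num_shard_bits);
}

}  // namespace ROCKSDB_NAMESPACE
```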
static const int kInfiniteCapacity = 0x400000; @@ -220,7 +221,7 @@ class TableCache { bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, size_t prefix_size, GetContext* get_context); - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const FileOptions& file_options_; Cache* const cache_; std::string row_cache_id_; @@ -228,6 +229,7 @@ class TableCache { BlockCacheTracer* const block_cache_tracer_; Striped loader_mutex_; std::shared_ptr io_tracer_; + std::string db_session_id_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc index b43afdc5c42..fdf48c92717 100644 --- a/db/table_properties_collector.cc +++ b/db/table_properties_collector.cc @@ -33,8 +33,8 @@ Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key, const Slice& value, uint64_t file_size) { ParsedInternalKey ikey; - Status s = ParseInternalKey(key, &ikey); - if (s != Status::OK()) { + Status s = ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO + if (!s.ok()) { return s; } @@ -43,10 +43,10 @@ Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key, } void UserKeyTablePropertiesCollector::BlockAdd( - uint64_t bLockRawBytes, uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow) { - return collector_->BlockAdd(bLockRawBytes, blockCompressedBytesFast, - blockCompressedBytesSlow); + uint64_t block_raw_bytes, uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) { + return collector_->BlockAdd(block_raw_bytes, block_compressed_bytes_fast, + block_compressed_bytes_slow); } Status UserKeyTablePropertiesCollector::Finish( diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h index 130eb64d480..befb436529a 100644 --- a/db/table_properties_collector.h +++ b/db/table_properties_collector.h @@ -27,9 +27,9 @@ class IntTblPropCollector { virtual Status InternalAdd(const Slice& key, const Slice& value, uint64_t file_size) = 0; - virtual void BlockAdd(uint64_t blockRawBytes, - uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow) = 0; + virtual void BlockAdd(uint64_t block_raw_bytes, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) = 0; virtual UserCollectedProperties GetReadableProperties() const = 0; @@ -48,6 +48,13 @@ class IntTblPropCollectorFactory { virtual const char* Name() const = 0; }; +using IntTblPropCollectorFactories = + std::vector>; +using IntTblPropCollectorFactoryIter = + IntTblPropCollectorFactories::const_iterator; +using IntTblPropCollectorFactoryRange = + std::pair; + // When rocksdb creates a new table, it will encode all "user keys" into // "internal keys", which contains meta information of a given entry. 
// @@ -64,9 +71,9 @@ class UserKeyTablePropertiesCollector : public IntTblPropCollector { virtual Status InternalAdd(const Slice& key, const Slice& value, uint64_t file_size) override; - virtual void BlockAdd(uint64_t blockRawBytes, - uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow) override; + virtual void BlockAdd(uint64_t block_raw_bytes, + uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow) override; virtual Status Finish(UserCollectedProperties* properties) override; diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 56d7edefe96..301302baeba 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -13,7 +13,6 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/sequence_file_reader.h" #include "file/writable_file_writer.h" #include "options/cf_options.h" @@ -42,23 +41,22 @@ namespace { static const uint32_t kTestColumnFamilyId = 66; static const std::string kTestColumnFamilyName = "test_column_fam"; -void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions, - const MutableCFOptions& moptions, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - std::unique_ptr* writable, - std::unique_ptr* builder) { - std::unique_ptr wf(new test::StringSink); +void MakeBuilder( + const Options& options, const ImmutableOptions& ioptions, + const MutableCFOptions& moptions, + const InternalKeyComparator& internal_comparator, + const IntTblPropCollectorFactories* int_tbl_prop_collector_factories, + std::unique_ptr* writable, + std::unique_ptr* builder) { + std::unique_ptr wf(new test::StringSink); writable->reset( - new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(wf)), - "" /* don't care */, EnvOptions())); + new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); int unknown_level = -1; - builder->reset(NewTableBuilder( + TableBuilderOptions tboptions( ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories, - kTestColumnFamilyId, kTestColumnFamilyName, writable->get(), - options.compression, options.sample_for_compression, - options.compression_opts, unknown_level)); + options.compression, options.compression_opts, kTestColumnFamilyId, + kTestColumnFamilyName, unknown_level); + builder->reset(NewTableBuilder(tboptions, writable->get())); } } // namespace @@ -178,9 +176,9 @@ class RegularKeysStartWithAInternal : public IntTblPropCollector { return Status::OK(); } - void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) override { + void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Nothing to do. 
return; } @@ -264,10 +262,9 @@ void TestCustomizedTablePropertiesCollector( // -- Step 1: build table std::unique_ptr builder; std::unique_ptr writer; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; if (test_int_tbl_prop_collector) { int_tbl_prop_collector_factories.emplace_back( new RegularKeysStartWithAFactory(backward_mode)); @@ -286,12 +283,13 @@ void TestCustomizedTablePropertiesCollector( writer->Flush(); // -- Step 2: Read properties - LegacyWritableFileWrapper* file = - static_cast(writer->writable_file()); - test::StringSink* fwf = static_cast(file->target()); + test::StringSink* fwf = + static_cast(writer->writable_file()); + std::unique_ptr source( + new test::StringSource(fwf->contents())); std::unique_ptr fake_file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(fwf->contents()))); + new RandomAccessFileReader(std::move(source), "test")); + TableProperties* props; Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(), magic_number, ioptions, &props, @@ -396,8 +394,7 @@ void TestInternalKeyPropertiesCollector( Options options; test::PlainInternalKeyComparator pikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; options.table_factory = table_factory; if (sanitized) { options.table_properties_collector_factories.emplace_back( @@ -410,11 +407,11 @@ void TestInternalKeyPropertiesCollector( options.info_log = std::make_shared(); options = SanitizeOptions("db", // just a place holder options); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories); options.comparator = comparator; } - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); MutableCFOptions moptions(options); for (int iter = 0; iter < 2; ++iter) { @@ -427,12 +424,13 @@ void TestInternalKeyPropertiesCollector( ASSERT_OK(builder->Finish()); writable->Flush(); - LegacyWritableFileWrapper* file = - static_cast(writable->writable_file()); - test::StringSink* fwf = static_cast(file->target()); + test::StringSink* fwf = + static_cast(writable->writable_file()); + std::unique_ptr source( + new test::StringSource(fwf->contents())); std::unique_ptr reader( - test::GetRandomAccessFileReader( - new test::StringSource(fwf->contents()))); + new RandomAccessFileReader(std::move(source), "test")); + TableProperties* props; Status s = ReadTableProperties(reader.get(), fwf->contents().size(), magic_number, diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index e6180903ff5..ba4c65ff927 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -34,7 +34,7 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl( io_tracer_(io_tracer) { assert(files_ != nullptr); assert(versions_ != nullptr); - + current_status_.PermitUncheckedError(); // Clear on start reporter_.env = options_->env; reporter_.info_log = options_->info_log.get(); SeekToStartSequence(); // Seek till starting sequence @@ -225,7 +225,8 @@ bool TransactionLogIteratorImpl::IsBatchExpected( void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { std::unique_ptr batch(new WriteBatch()); - WriteBatchInternal::SetContents(batch.get(), record); + Status s = 
WriteBatchInternal::SetContents(batch.get(), record); + s.PermitUncheckedError(); // TODO: What should we do with this error? SequenceNumber expected_seq = current_last_seq_ + 1; // If the iterator has started, then confirm that we get continuous batches diff --git a/db/version_builder.cc b/db/version_builder.cc index 49c35cf9f2d..474169bda74 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -334,17 +334,23 @@ class VersionBuilder::Rep { TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistency1", &pair); #endif if (!level_nonzero_cmp_(f1, f2)) { - return Status::Corruption("L" + NumberToString(level) + - " files are not sorted properly"); + return Status::Corruption( + "L" + NumberToString(level) + + " files are not sorted properly: files #" + + NumberToString(f1->fd.GetNumber()) + ", #" + + NumberToString(f2->fd.GetNumber())); } // Make sure there is no overlap in levels > 0 if (vstorage->InternalComparator()->Compare(f1->largest, f2->smallest) >= 0) { return Status::Corruption( - "L" + NumberToString(level) + " have overlapping ranges " + - (f1->largest).DebugString(true) + " vs. " + - (f2->smallest).DebugString(true)); + "L" + NumberToString(level) + + " have overlapping ranges: file #" + + NumberToString(f1->fd.GetNumber()) + + " largest key: " + (f1->largest).DebugString(true) + + " vs. file #" + NumberToString(f2->fd.GetNumber()) + + " smallest key: " + (f2->smallest).DebugString(true)); } } } @@ -511,6 +517,28 @@ class VersionBuilder::Rep { return meta->oldest_blob_file_number; } + uint64_t GetMinOldestBlobFileNumber() const { + uint64_t min_oldest_blob_file_num = std::numeric_limits::max(); + for (int level = 0; level < num_levels_; ++level) { + const auto& base_files = base_vstorage_->LevelFiles(level); + for (const auto* fmeta : base_files) { + assert(fmeta); + min_oldest_blob_file_num = + std::min(min_oldest_blob_file_num, fmeta->oldest_blob_file_number); + } + const auto& added_files = levels_[level].added_files; + for (const auto& elem : added_files) { + assert(elem.second); + min_oldest_blob_file_num = std::min( + min_oldest_blob_file_num, elem.second->oldest_blob_file_number); + } + } + if (min_oldest_blob_file_num == std::numeric_limits::max()) { + min_oldest_blob_file_num = kInvalidBlobFileNumber; + } + return min_oldest_blob_file_num; + } + Status ApplyFileDeletion(int level, uint64_t file_number) { assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel()); @@ -828,7 +856,7 @@ class VersionBuilder::Rep { } } - // Save the current state in *v. + // Save the current state in *vstorage. 
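GetMinOldestBlobFileNumber above folds a minimum over base and added files, using kInvalidBlobFileNumber as the sentinel when there are no files at all. A standalone, simplified sketch of that reduction; the names and the sentinel value are restated here as assumptions:

```cpp
#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

namespace {

constexpr uint64_t kInvalidBlobFileNumber = 0;  // assumption: 0 means "none"

// Minimum oldest-blob-file number across a set of SST files, falling back to
// the invalid sentinel when the set is empty.
uint64_t MinOldestBlobFile(const std::vector<uint64_t>& oldest_per_sst) {
  uint64_t min_num = std::numeric_limits<uint64_t>::max();
  for (uint64_t oldest : oldest_per_sst) {
    min_num = std::min(min_num, oldest);
  }
  return min_num == std::numeric_limits<uint64_t>::max()
             ? kInvalidBlobFileNumber
             : min_num;
}

}  // namespace
```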
Status SaveTo(VersionStorageInfo* vstorage) { Status s = CheckConsistency(base_vstorage_); if (!s.ok()) { @@ -1046,6 +1074,10 @@ Status VersionBuilder::LoadTableHandlers( is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin); } +uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const { + return rep_->GetMinOldestBlobFileNumber(); +} + BaseReferencedVersionBuilder::BaseReferencedVersionBuilder( ColumnFamilyData* cfd) : version_builder_(new VersionBuilder( diff --git a/db/version_builder.h b/db/version_builder.h index a4e1c0d63ba..5a5c9ea1807 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -44,6 +44,7 @@ class VersionBuilder { bool is_initial_load, const SliceTransform* prefix_extractor, size_t max_file_size_for_l0_meta_pin); + uint64_t GetMinOldestBlobFileNumber() const; private: class Rep; diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 513900de5bc..26b473f0fe6 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -11,7 +11,6 @@ #include "db/version_edit.h" #include "db/version_set.h" -#include "logging/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" @@ -23,7 +22,7 @@ class VersionBuilderTest : public testing::Test { const Comparator* ucmp_; InternalKeyComparator icmp_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; VersionStorageInfo vstorage_; uint32_t file_num_; diff --git a/db/version_edit.cc b/db/version_edit.cc index 8879f0e1bca..8cb173a2dd0 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -89,13 +89,14 @@ void VersionEdit::Clear() { blob_file_additions_.clear(); blob_file_garbages_.clear(); wal_additions_.clear(); - wal_deletions_.clear(); + wal_deletion_.Reset(); column_family_ = 0; is_column_family_add_ = false; is_column_family_drop_ = false; column_family_name_.clear(); is_in_atomic_group_ = false; remaining_entries_ = 0; + full_history_ts_low_.clear(); } bool VersionEdit::EncodeTo(std::string* dst) const { @@ -190,6 +191,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const { char p = static_cast(f.fd.GetPathId()); PutLengthPrefixedSlice(dst, Slice(&p, 1)); } + if (f.temperature != Temperature::kUnknown) { + PutVarint32(dst, NewFileCustomTag::kTemperature); + char p = static_cast(f.temperature); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } if (f.marked_for_compaction) { PutVarint32(dst, NewFileCustomTag::kNeedCompaction); char p = static_cast(1); @@ -225,13 +231,17 @@ bool VersionEdit::EncodeTo(std::string* dst) const { } for (const auto& wal_addition : wal_additions_) { - PutVarint32(dst, kWalAddition); - wal_addition.EncodeTo(dst); + PutVarint32(dst, kWalAddition2); + std::string encoded; + wal_addition.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); } - for (const auto& wal_deletion : wal_deletions_) { - PutVarint32(dst, kWalDeletion); - wal_deletion.EncodeTo(dst); + if (!wal_deletion_.IsEmpty()) { + PutVarint32(dst, kWalDeletion2); + std::string encoded; + wal_deletion_.EncodeTo(&encoded); + PutLengthPrefixedSlice(dst, encoded); } // 0 is default and does not need to be explicitly written @@ -252,6 +262,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const { PutVarint32(dst, kInAtomicGroup); PutVarint32(dst, remaining_entries_); } + + if (HasFullHistoryTsLow()) { + PutVarint32(dst, kFullHistoryTsLow); + PutLengthPrefixedSlice(dst, full_history_ts_low_); + } return true; } @@ -350,6 +365,16 @@ const char* 
VersionEdit::DecodeNewFile4From(Slice* input) { return "invalid oldest blob file number"; } break; + case kTemperature: + if (field.size() != 1) { + return "temperature field wrong size"; + } else { + Temperature casted_field = static_cast(field[0]); + if (casted_field <= Temperature::kCold) { + f.temperature = casted_field; + } + } + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -369,6 +394,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { Status VersionEdit::DecodeFrom(const Slice& src) { Clear(); +#ifndef NDEBUG + bool ignore_ignorable_tags = false; + TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:IgnoreIgnorableTags", + &ignore_ignorable_tags); +#endif Slice input = src; const char* msg = nullptr; uint32_t tag = 0; @@ -379,6 +409,11 @@ Status VersionEdit::DecodeFrom(const Slice& src) { Slice str; InternalKey key; while (msg == nullptr && GetVarint32(&input, &tag)) { +#ifndef NDEBUG + if (ignore_ignorable_tags && tag > kTagSafeIgnoreMask) { + tag = kTagSafeIgnoreMask; + } +#endif switch (tag) { case kDbId: if (GetLengthPrefixedSlice(&input, &str)) { @@ -536,7 +571,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) { break; } - case kBlobFileAddition: { + case kBlobFileAddition: + case kBlobFileAddition_DEPRECATED: { BlobFileAddition blob_file_addition; const Status s = blob_file_addition.DecodeFrom(&input); if (!s.ok()) { @@ -547,7 +583,8 @@ Status VersionEdit::DecodeFrom(const Slice& src) { break; } - case kBlobFileGarbage: { + case kBlobFileGarbage: + case kBlobFileGarbage_DEPRECATED: { BlobFileGarbage blob_file_garbage; const Status s = blob_file_garbage.DecodeFrom(&input); if (!s.ok()) { @@ -569,6 +606,23 @@ Status VersionEdit::DecodeFrom(const Slice& src) { break; } + case kWalAddition2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalAddition not prefixed by length"; + break; + } + + WalAddition wal_addition; + const Status s = wal_addition.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_additions_.emplace_back(std::move(wal_addition)); + break; + } + case kWalDeletion: { WalDeletion wal_deletion; const Status s = wal_deletion.DecodeFrom(&input); @@ -576,7 +630,24 @@ Status VersionEdit::DecodeFrom(const Slice& src) { return s; } - wal_deletions_.emplace_back(std::move(wal_deletion)); + wal_deletion_ = std::move(wal_deletion); + break; + } + + case kWalDeletion2: { + Slice encoded; + if (!GetLengthPrefixedSlice(&input, &encoded)) { + msg = "WalDeletion not prefixed by length"; + break; + } + + WalDeletion wal_deletion; + const Status s = wal_deletion.DecodeFrom(&encoded); + if (!s.ok()) { + return s; + } + + wal_deletion_ = std::move(wal_deletion); break; } @@ -612,6 +683,16 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } break; + case kFullHistoryTsLow: + if (!GetLengthPrefixedSlice(&input, &str)) { + msg = "full_history_ts_low"; + } else if (str.empty()) { + msg = "full_history_ts_low: empty"; + } else { + full_history_ts_low_.assign(str.data(), str.size()); + } + break; + default: if (tag & kTagSafeIgnoreMask) { // Tag from future which can be safely ignored. 
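kWalAddition2 and kWalDeletion2 above are written as a varint tag followed by a length-prefixed payload, so a reader that does not recognize a tag past kTagSafeIgnoreMask can skip the whole record. A hedged sketch of that framing, using only the util/coding.h helpers that appear in this patch; both functions are illustrative:

```cpp
#include <cstdint>
#include <string>

#include "rocksdb/slice.h"
#include "util/coding.h"

namespace ROCKSDB_NAMESPACE {

// Writes an ignorable record: varint tag, then a length-prefixed payload.
void PutIgnorableRecord(std::string* dst, uint32_t tag,
                        const std::string& payload) {
  PutVarint32(dst, tag);
  PutLengthPrefixedSlice(dst, Slice(payload));
}

// Consumes (and discards) the payload of an unrecognized ignorable tag so
// decoding can continue with the next record.
bool SkipIgnorableRecord(Slice* input) {
  Slice payload;
  return GetLengthPrefixedSlice(input, &payload);
}

}  // namespace ROCKSDB_NAMESPACE
```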
@@ -708,6 +789,12 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append(f.file_checksum); r.append(" file_checksum_func_name: "); r.append(f.file_checksum_func_name); + if (f.temperature != Temperature::kUnknown) { + r.append(" temperature: "); + // Maybe change to human readable format when the feature becomes + // permanent + r.append(ToString(static_cast(f.temperature))); + } } for (const auto& blob_file_addition : blob_file_additions_) { @@ -725,9 +812,9 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append(wal_addition.DebugString()); } - for (const auto& wal_deletion : wal_deletions_) { + if (!wal_deletion_.IsEmpty()) { r.append("\n WalDeletion: "); - r.append(wal_deletion.DebugString()); + r.append(wal_deletion_.DebugString()); } r.append("\n ColumnFamily: "); @@ -744,6 +831,10 @@ std::string VersionEdit::DebugString(bool hex_key) const { AppendNumberTo(&r, remaining_entries_); r.append(" entries remains"); } + if (HasFullHistoryTsLow()) { + r.append("\n FullHistoryTsLow: "); + r.append(Slice(full_history_ts_low_).ToString(hex_key)); + } r.append("\n}\n"); return r; } @@ -806,6 +897,11 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { jw << "OldestBlobFile" << f.oldest_blob_file_number; } + if (f.temperature != Temperature::kUnknown) { + // Maybe change to human readable format when the feature becomes + // permanent + jw << "Temperature" << static_cast(f.temperature); + } jw.EndArrayedObject(); } @@ -854,18 +950,11 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { jw.EndArray(); } - if (!wal_deletions_.empty()) { - jw << "WalDeletions"; - - jw.StartArray(); - - for (const auto& wal_deletion : wal_deletions_) { - jw.StartArrayedObject(); - jw << wal_deletion; - jw.EndArrayedObject(); - } - - jw.EndArray(); + if (!wal_deletion_.IsEmpty()) { + jw << "WalDeletion"; + jw.StartObject(); + jw << wal_deletion_; + jw.EndObject(); } jw << "ColumnFamily" << column_family_; @@ -880,6 +969,10 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { jw << "AtomicGroup" << remaining_entries_; } + if (HasFullHistoryTsLow()) { + jw << "FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key); + } + jw.EndObject(); return jw.Get(); diff --git a/db/version_edit.h b/db/version_edit.h index b4ab74741a6..ce3283f1422 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -52,15 +52,21 @@ enum Tag : uint32_t { kInAtomicGroup = 300, + kBlobFileAddition = 400, + kBlobFileGarbage, + // Mask for an unidentified tag from the future which can be safely ignored. kTagSafeIgnoreMask = 1 << 13, // Forward compatible (aka ignorable) records kDbId, - kBlobFileAddition, - kBlobFileGarbage, + kBlobFileAddition_DEPRECATED, + kBlobFileGarbage_DEPRECATED, kWalAddition, kWalDeletion, + kFullHistoryTsLow, + kWalAddition2, + kWalDeletion2, }; enum NewFileCustomTag : uint32_t { @@ -68,13 +74,14 @@ enum NewFileCustomTag : uint32_t { kNeedCompaction = 2, // Since Manifest is not entirely forward-compatible, we currently encode // kMinLogNumberToKeep as part of NewFile as a hack. This should be removed - // when manifest becomes forward-comptabile. + // when manifest becomes forward-compatible. kMinLogNumberToKeepHack = 3, kOldestBlobFileNumber = 4, kOldestAncesterTime = 5, kFileCreationTime = 6, kFileChecksum = 7, kFileChecksumFuncName = 8, + kTemperature = 9, // If this bit for the custom tag is set, opening DB should fail if // we don't know this field.
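The new kTemperature custom tag above stores the temperature as a single byte in NewFile4's custom-field list. A hedged sketch of encoding and decoding that field; the helper names and the chosen headers are assumptions, while the tag value 9 and the bounds check mirror this patch:

```cpp
#include <string>

#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "util/coding.h"

namespace ROCKSDB_NAMESPACE {

// Encodes the temperature as a one-byte custom field under tag 9.
void PutTemperatureField(std::string* dst, Temperature temperature) {
  PutVarint32(dst, 9 /* NewFileCustomTag::kTemperature */);
  char p = static_cast<char>(temperature);
  PutLengthPrefixedSlice(dst, Slice(&p, 1));
}

// Decodes the field; returns an error string on malformed input, nullptr on
// success. Values this binary does not recognize are silently dropped.
const char* GetTemperatureField(const Slice& field, Temperature* temperature) {
  if (field.size() != 1) {
    return "temperature field wrong size";
  }
  Temperature casted = static_cast<Temperature>(field[0]);
  if (casted <= Temperature::kCold) {
    *temperature = casted;
  }
  return nullptr;
}

}  // namespace ROCKSDB_NAMESPACE
```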
@@ -182,6 +189,7 @@ struct FileMetaData { bool marked_for_compaction = false; // True if client asked us nicely to // compact this file. + Temperature temperature = Temperature::kUnknown; // Used only in BlobDB. The file number of the oldest blob file this SST file // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. @@ -189,7 +197,7 @@ struct FileMetaData { // The file could be the compaction output from other SST files, which could // in turn be outputs for compact older SST files. We track the memtable - // flush timestamp for the oldest SST file that eventaully contribute data + // flush timestamp for the oldest SST file that eventually contribute data // to this file. 0 means the information is not available. uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; @@ -425,6 +433,7 @@ class VersionEdit { } void SetBlobFileAdditions(BlobFileAdditions blob_file_additions) { + assert(blob_file_additions_.empty()); blob_file_additions_ = std::move(blob_file_additions); } @@ -448,32 +457,44 @@ class VersionEdit { } void SetBlobFileGarbages(BlobFileGarbages blob_file_garbages) { + assert(blob_file_garbages_.empty()); blob_file_garbages_ = std::move(blob_file_garbages); } // Add a WAL (either just created or closed). + // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit. void AddWal(WalNumber number, WalMetadata metadata = WalMetadata()) { + assert(NumEntries() == wal_additions_.size()); wal_additions_.emplace_back(number, std::move(metadata)); } // Retrieve all the added WALs. const WalAdditions& GetWalAdditions() const { return wal_additions_; } - bool HasWalAddition() const { return !wal_additions_.empty(); } + bool IsWalAddition() const { return !wal_additions_.empty(); } // Delete a WAL (either directly deleted or archived). - void DeleteWal(WalNumber number) { wal_deletions_.emplace_back(number); } + // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit. + void DeleteWalsBefore(WalNumber number) { + assert((NumEntries() == 1) == !wal_deletion_.IsEmpty()); + wal_deletion_ = WalDeletion(number); + } + + const WalDeletion& GetWalDeletion() const { return wal_deletion_; } - // Retrieve all the deleted WALs. - const WalDeletions& GetWalDeletions() const { return wal_deletions_; } + bool IsWalDeletion() const { return !wal_deletion_.IsEmpty(); } - bool HasWalDeletion() const { return !wal_deletions_.empty(); } + bool IsWalManipulation() const { + size_t entries = NumEntries(); + return (entries > 0) && ((entries == wal_additions_.size()) || + (entries == !wal_deletion_.IsEmpty())); + } // Number of edits size_t NumEntries() const { return new_files_.size() + deleted_files_.size() + blob_file_additions_.size() + blob_file_garbages_.size() + - wal_additions_.size() + wal_deletions_.size(); + wal_additions_.size() + !wal_deletion_.IsEmpty(); } void SetColumnFamily(uint32_t column_family_id) { @@ -513,6 +534,16 @@ class VersionEdit { bool IsInAtomicGroup() const { return is_in_atomic_group_; } uint32_t GetRemainingEntries() const { return remaining_entries_; } + bool HasFullHistoryTsLow() const { return !full_history_ts_low_.empty(); } + const std::string& GetFullHistoryTsLow() const { + assert(HasFullHistoryTsLow()); + return full_history_ts_low_; + } + void SetFullHistoryTsLow(std::string full_history_ts_low) { + assert(!full_history_ts_low.empty()); + full_history_ts_low_ = std::move(full_history_ts_low); + } + // return true on success. 
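With this change a VersionEdit carries either WAL additions or a single DeleteWalsBefore marker, never both. A hedged sketch of emitting the two kinds of edits; the helper functions are illustrative, and only AddWal and DeleteWalsBefore come from this patch:

```cpp
#include "db/version_edit.h"

namespace ROCKSDB_NAMESPACE {

// Records a newly created (or closed) WAL in a VersionEdit.
VersionEdit MakeWalAdditionEdit(WalNumber number) {
  VersionEdit edit;
  edit.AddWal(number);  // default WalMetadata; exclusive with DeleteWalsBefore
  return edit;
}

// Records that every WAL numbered below `up_to` is now obsolete.
VersionEdit MakeWalDeletionEdit(WalNumber up_to) {
  VersionEdit edit;
  edit.DeleteWalsBefore(up_to);
  return edit;
}

}  // namespace ROCKSDB_NAMESPACE
```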
bool EncodeTo(std::string* dst) const; Status DecodeFrom(const Slice& src); @@ -529,8 +560,11 @@ class VersionEdit { private: friend class ReactiveVersionSet; + friend class VersionEditHandlerBase; + friend class ListColumnFamiliesHandler; friend class VersionEditHandler; friend class VersionEditHandlerPointInTime; + friend class DumpManifestHandler; friend class VersionSet; friend class Version; friend class AtomicGroupReadBuffer; @@ -565,7 +599,7 @@ class VersionEdit { BlobFileGarbages blob_file_garbages_; WalAdditions wal_additions_; - WalDeletions wal_deletions_; + WalDeletion wal_deletion_; // Each version edit record should have column_family_ set // If it's not set, it is default (0) @@ -579,6 +613,8 @@ class VersionEdit { bool is_in_atomic_group_ = false; uint32_t remaining_entries_ = 0; + + std::string full_history_ts_low_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 75fe107c58f..7a2996a59e2 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -9,28 +9,16 @@ #include "db/version_edit_handler.h" +#include + +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_file_reader.h" #include "monitoring/persistent_stats_history.h" namespace ROCKSDB_NAMESPACE { -VersionEditHandler::VersionEditHandler( - bool read_only, const std::vector& column_families, - VersionSet* version_set, bool track_missing_files, - bool no_error_if_table_files_missing, - const std::shared_ptr& io_tracer) - : read_only_(read_only), - column_families_(column_families), - status_(), - version_set_(version_set), - track_missing_files_(track_missing_files), - no_error_if_table_files_missing_(no_error_if_table_files_missing), - initialized_(false), - io_tracer_(io_tracer) { - assert(version_set_ != nullptr); -} - -void VersionEditHandler::Iterate(log::Reader& reader, Status* log_read_status, - std::string* db_id) { +void VersionEditHandlerBase::Iterate(log::Reader& reader, + Status* log_read_status) { Slice record; std::string scratch; assert(log_read_status); @@ -38,19 +26,14 @@ void VersionEditHandler::Iterate(log::Reader& reader, Status* log_read_status, size_t recovered_edits = 0; Status s = Initialize(); - while (s.ok() && reader.ReadRecord(&record, &scratch) && - log_read_status->ok()) { + while (reader.LastRecordEnd() < max_manifest_read_size_ && s.ok() && + reader.ReadRecord(&record, &scratch) && log_read_status->ok()) { VersionEdit edit; s = edit.DecodeFrom(record); if (!s.ok()) { break; } - if (edit.has_db_id_) { - version_set_->db_id_ = edit.GetDbId(); - if (db_id != nullptr) { - *db_id = version_set_->db_id_; - } - } + s = read_buffer_.AddEdit(&edit); if (!s.ok()) { break; @@ -86,6 +69,80 @@ void VersionEditHandler::Iterate(log::Reader& reader, Status* log_read_status, if (!s.ok()) { status_ = s; } + TEST_SYNC_POINT_CALLBACK("VersionEditHandlerBase::Iterate:Finish", + &recovered_edits); +} + +Status ListColumnFamiliesHandler::ApplyVersionEdit( + VersionEdit& edit, ColumnFamilyData** /*unused*/) { + Status s; + if (edit.is_column_family_add_) { + if (column_family_names_.find(edit.column_family_) != + column_family_names_.end()) { + s = Status::Corruption("Manifest adding the same column family twice"); + } else { + column_family_names_.insert( + {edit.column_family_, edit.column_family_name_}); + } + } else if (edit.is_column_family_drop_) { + if (column_family_names_.find(edit.column_family_) == + column_family_names_.end()) { + s = Status::Corruption("Manifest - dropping non-existing column family"); + } else 
{ + column_family_names_.erase(edit.column_family_); + } + } + return s; +} + +Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) { + for (const auto& deleted_file : edit.GetDeletedFiles()) { + Status s = file_checksum_list_.RemoveOneFileChecksum(deleted_file.second); + if (!s.ok()) { + return s; + } + } + for (const auto& new_file : edit.GetNewFiles()) { + Status s = file_checksum_list_.InsertOneFileChecksum( + new_file.second.fd.GetNumber(), new_file.second.file_checksum, + new_file.second.file_checksum_func_name); + if (!s.ok()) { + return s; + } + } + for (const auto& new_blob_file : edit.GetBlobFileAdditions()) { + std::string checksum_value = new_blob_file.GetChecksumValue(); + std::string checksum_method = new_blob_file.GetChecksumMethod(); + assert(checksum_value.empty() == checksum_method.empty()); + if (checksum_method.empty()) { + checksum_value = kUnknownFileChecksum; + checksum_method = kUnknownFileChecksumFuncName; + } + Status s = file_checksum_list_.InsertOneFileChecksum( + new_blob_file.GetBlobFileNumber(), checksum_value, checksum_method); + if (!s.ok()) { + return s; + } + } + return Status::OK(); +} + +VersionEditHandler::VersionEditHandler( + bool read_only, std::vector column_families, + VersionSet* version_set, bool track_missing_files, + bool no_error_if_files_missing, const std::shared_ptr& io_tracer, + bool skip_load_table_files) + : VersionEditHandlerBase(), + read_only_(read_only), + column_families_(std::move(column_families)), + version_set_(version_set), + track_missing_files_(track_missing_files), + no_error_if_files_missing_(no_error_if_files_missing), + io_tracer_(io_tracer), + skip_load_table_files_(skip_load_table_files), + initialized_(false) { + assert(version_set_ != nullptr); } Status VersionEditHandler::Initialize() { @@ -121,6 +178,10 @@ Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit, s = OnColumnFamilyAdd(edit, cfd); } else if (edit.is_column_family_drop_) { s = OnColumnFamilyDrop(edit, cfd); + } else if (edit.IsWalAddition()) { + s = OnWalAddition(edit); + } else if (edit.IsWalDeletion()) { + s = OnWalDeletion(edit); } else { s = OnNonCfOperation(edit, cfd); } @@ -190,6 +251,17 @@ Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit, return s; } +Status VersionEditHandler::OnWalAddition(VersionEdit& edit) { + assert(edit.IsWalAddition()); + return version_set_->wals_.AddWals(edit.GetWalAdditions()); +} + +Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) { + assert(edit.IsWalDeletion()); + return version_set_->wals_.DeleteWalsBefore( + edit.GetWalDeletion().GetLogNumber()); +} + Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit, ColumnFamilyData** cfd) { bool cf_in_not_found = false; @@ -231,6 +303,14 @@ bool VersionEditHandler::HasMissingFiles() const { break; } } + if (!ret) { + for (const auto& elem : cf_to_missing_blob_files_high_) { + if (elem.second != kInvalidBlobFileNumber) { + ret = true; + break; + } + } + } return ret; } @@ -259,7 +339,7 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, Status* s) { assert(s != nullptr); if (!s->ok()) { - read_buffer_.Clear(); + // Do nothing here. 
} else if (!version_edit_params_.has_log_number_ || !version_edit_params_.has_next_file_number_ || !version_edit_params_.has_last_sequence_) { @@ -277,7 +357,10 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, msg.append(" entry in MANIFEST"); *s = Status::Corruption(msg); } - if (s->ok() && !read_only_ && !column_families_not_found_.empty()) { + // There were some column families in the MANIFEST that weren't specified + // in the argument. This is OK in read_only mode + if (s->ok() && MustOpenAllColumnFamilies() && + !column_families_not_found_.empty()) { std::string msg; for (const auto& cf : column_families_not_found_) { msg.append(", "); @@ -294,6 +377,9 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_); version_set_->MarkFileNumberUsed(version_edit_params_.log_number_); for (auto* cfd : *(version_set_->GetColumnFamilySet())) { + if (cfd->IsDropped()) { + continue; + } auto builder_iter = builders_.find(cfd->GetID()); assert(builder_iter != builders_.end()); auto* builder = builder_iter->second->version_builder(); @@ -315,6 +401,10 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, *s = LoadTables(cfd, /*prefetch_index_and_filter_in_cache=*/false, /*is_initial_load=*/true); if (!s->ok()) { + // If s is IOError::PathNotFound, then we mark the db as corrupted. + if (s->IsPathNotFound()) { + *s = Status::Corruption("Corruption: " + s->ToString()); + } break; } } @@ -357,6 +447,8 @@ ColumnFamilyData* VersionEditHandler::CreateCfAndInit( if (track_missing_files_) { cf_to_missing_files_.emplace(edit.column_family_, std::unordered_set()); + cf_to_missing_blob_files_high_.emplace(edit.column_family_, + kInvalidBlobFileNumber); } return cfd; } @@ -370,15 +462,19 @@ ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup( auto missing_files_iter = cf_to_missing_files_.find(edit.column_family_); assert(missing_files_iter != cf_to_missing_files_.end()); cf_to_missing_files_.erase(missing_files_iter); + + auto missing_blob_files_high_iter = + cf_to_missing_blob_files_high_.find(edit.column_family_); + assert(missing_blob_files_high_iter != + cf_to_missing_blob_files_high_.end()); + cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter); } ColumnFamilyData* ret = version_set_->GetColumnFamilySet()->GetColumnFamily(edit.column_family_); assert(ret != nullptr); - if (ret->UnrefAndTryDelete()) { - ret = nullptr; - } else { - assert(false); - } + ret->SetDropped(); + ret->UnrefAndTryDelete(); + ret = nullptr; return ret; } @@ -411,6 +507,9 @@ Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/, Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, bool prefetch_index_and_filter_in_cache, bool is_initial_load) { + if (skip_load_table_files_) { + return Status::OK(); + } assert(cfd != nullptr); assert(!cfd->IsDropped()); auto builder_iter = builders_.find(cfd->GetID()); @@ -424,8 +523,7 @@ Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, prefetch_index_and_filter_in_cache, is_initial_load, cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); - if ((s.IsPathNotFound() || s.IsCorruption()) && - no_error_if_table_files_missing_) { + if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) { s = Status::OK(); } if (!s.ok() && !version_set_->db_options_->paranoid_checks) { @@ -437,10 +535,11 @@ Status 
VersionEditHandler::LoadTables(ColumnFamilyData* cfd, Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, const VersionEdit& edit) { Status s; + if (edit.has_db_id_) { + version_set_->db_id_ = edit.GetDbId(); + version_edit_params_.SetDBId(edit.db_id_); + } if (cfd != nullptr) { - if (edit.has_db_id_) { - version_edit_params_.SetDBId(edit.db_id_); - } if (edit.has_log_number_) { if (cfd->GetLogNumber() > edit.log_number_) { ROCKS_LOG_WARN( @@ -454,9 +553,17 @@ Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, } if (edit.has_comparator_ && edit.comparator_ != cfd->user_comparator()->Name()) { - s = Status::InvalidArgument( - cfd->user_comparator()->Name(), - "does not match existing comparator " + edit.comparator_); + if (!cf_to_cmp_names_) { + s = Status::InvalidArgument( + cfd->user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + } else { + cf_to_cmp_names_->emplace(cfd->GetID(), edit.comparator_); + } + } + if (edit.HasFullHistoryTsLow()) { + const std::string& new_ts = edit.GetFullHistoryTsLow(); + cfd->SetFullHistoryTsLow(new_ts); } } @@ -486,12 +593,11 @@ Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, } VersionEditHandlerPointInTime::VersionEditHandlerPointInTime( - bool read_only, const std::vector& column_families, + bool read_only, std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer) : VersionEditHandler(read_only, column_families, version_set, /*track_missing_files=*/true, - /*no_error_if_table_files_missing=*/true, io_tracer), - io_tracer_(io_tracer) {} + /*no_error_if_files_missing=*/true, io_tracer) {} VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() { for (const auto& elem : versions_) { @@ -541,7 +647,29 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID()); assert(missing_files_iter != cf_to_missing_files_.end()); std::unordered_set& missing_files = missing_files_iter->second; - const bool prev_has_missing_files = !missing_files.empty(); + + auto missing_blob_files_high_iter = + cf_to_missing_blob_files_high_.find(cfd->GetID()); + assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end()); + const uint64_t prev_missing_blob_file_high = + missing_blob_files_high_iter->second; + + VersionBuilder* builder = nullptr; + + if (prev_missing_blob_file_high != kInvalidBlobFileNumber) { + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + } + + // At this point, we have not yet applied the new version edits read from the + // MANIFEST. We check whether we have any missing table and blob files. 
+ const bool prev_has_missing_files = + !missing_files.empty() || + (prev_missing_blob_file_high != kInvalidBlobFileNumber && + prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber()); + for (const auto& file : edit.GetDeletedFiles()) { uint64_t file_num = file.second; auto fiter = missing_files.find(file_num); @@ -549,6 +677,8 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( missing_files.erase(fiter); } } + + assert(!cfd->ioptions()->cf_paths.empty()); Status s; for (const auto& elem : edit.GetNewFiles()) { const FileMetaData& meta = elem.second; @@ -556,7 +686,7 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( uint64_t file_num = fd.GetNumber(); const std::string fpath = MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num); - s = version_set_->VerifyFileMetadata(fpath, meta); + s = VerifyFile(fpath, meta); if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { missing_files.insert(file_num); s = Status::OK(); @@ -564,17 +694,60 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( break; } } + + uint64_t missing_blob_file_num = prev_missing_blob_file_high; + for (const auto& elem : edit.GetBlobFileAdditions()) { + uint64_t file_num = elem.GetBlobFileNumber(); + s = VerifyBlobFile(cfd, file_num, elem); + if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { + missing_blob_file_num = std::max(missing_blob_file_num, file_num); + s = Status::OK(); + } else if (!s.ok()) { + break; + } + } + + bool has_missing_blob_files = false; + if (missing_blob_file_num != kInvalidBlobFileNumber && + missing_blob_file_num >= prev_missing_blob_file_high) { + missing_blob_files_high_iter->second = missing_blob_file_num; + has_missing_blob_files = true; + } else if (missing_blob_file_num < prev_missing_blob_file_high) { + assert(false); + } + + // We still have not applied the new version edit, but have tried to add new + // table and blob files after verifying their presence and consistency. + // Therefore, we know whether we will see new missing table and blob files + // later after actually applying the version edit. We perform the check here + // and record the result. + const bool has_missing_files = + !missing_files.empty() || has_missing_blob_files; + bool missing_info = !version_edit_params_.has_log_number_ || !version_edit_params_.has_next_file_number_ || !version_edit_params_.has_last_sequence_; - // Create version before apply edit + // Create version before applying the edit. The version will represent the state + // before applying the version edit. + // A new version will be created if: + // 1) no error has occurred so far, and + // 2) log_number_, next_file_number_ and last_sequence_ are known, and + // 3) any of the following: + // a) no missing file before, but will have missing file(s) after applying + // this version edit. + // b) no missing file after applying the version edit, and the caller + // explicitly requests that a new version be created.
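The logic above tracks missing table files individually, while missing blob files are collapsed into a single per-column-family high watermark. A minimal self-contained sketch of that rule follows; the names are illustrative stand-ins, and the kInvalidBlobFileNumber sentinel is assumed to be 0, which this patch does not spell out.

#include <cstdint>
#include <unordered_set>

// Illustrative stand-ins; the real constant and accessors live elsewhere in the tree.
constexpr uint64_t kInvalidBlobFileNumber = 0;  // assumed sentinel value

// A column family still has missing files if any table file is unaccounted
// for, or if the highest missing blob file number is at least as large as the
// oldest blob file that the surviving table files may still reference.
bool HasMissingFiles(const std::unordered_set<uint64_t>& missing_table_files,
                     uint64_t missing_blob_file_high,
                     uint64_t min_oldest_blob_file_number) {
  const bool missing_blob_files =
      missing_blob_file_high != kInvalidBlobFileNumber &&
      missing_blob_file_high >= min_oldest_blob_file_number;
  return !missing_table_files.empty() || missing_blob_files;
}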
if (s.ok() && !missing_info && - ((!missing_files.empty() && !prev_has_missing_files) || - (missing_files.empty() && force_create_version))) { - auto builder_iter = builders_.find(cfd->GetID()); - assert(builder_iter != builders_.end()); - auto* builder = builder_iter->second->version_builder(); + ((has_missing_files && !prev_has_missing_files) || + (!has_missing_files && force_create_version))) { + if (!builder) { + auto builder_iter = builders_.find(cfd->GetID()); + assert(builder_iter != builders_.end()); + builder = builder_iter->second->version_builder(); + assert(builder); + } + auto* version = new Version(cfd, version_set_, version_set_->file_options_, *cfd->GetLatestMutableCFOptions(), io_tracer_, version_set_->current_version_number_++); @@ -597,4 +770,159 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( return s; } +Status VersionEditHandlerPointInTime::VerifyFile(const std::string& fpath, + const FileMetaData& fmeta) { + return version_set_->VerifyFileMetadata(fpath, fmeta); +} + +Status VersionEditHandlerPointInTime::VerifyBlobFile( + ColumnFamilyData* cfd, uint64_t blob_file_num, + const BlobFileAddition& blob_addition) { + BlobFileCache* blob_file_cache = cfd->blob_file_cache(); + assert(blob_file_cache); + CacheHandleGuard blob_file_reader; + Status s = + blob_file_cache->GetBlobFileReader(blob_file_num, &blob_file_reader); + if (!s.ok()) { + return s; + } + // TODO: verify checksum + (void)blob_addition; + return s; +} + +Status ManifestTailer::Initialize() { + if (Mode::kRecovery == mode_) { + return VersionEditHandler::Initialize(); + } + assert(Mode::kCatchUp == mode_); + Status s; + if (!initialized_) { + ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet(); + assert(cfd_set); + ColumnFamilyData* default_cfd = cfd_set->GetDefault(); + assert(default_cfd); + auto builder_iter = builders_.find(default_cfd->GetID()); + assert(builder_iter != builders_.end()); + + Version* dummy_version = default_cfd->dummy_versions(); + assert(dummy_version); + Version* base_version = dummy_version->Next(); + assert(base_version); + base_version->Ref(); + VersionBuilderUPtr new_builder( + new BaseReferencedVersionBuilder(default_cfd, base_version)); + builder_iter->second = std::move(new_builder); + + initialized_ = true; + } + return s; +} + +Status ManifestTailer::ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** cfd) { + Status s = VersionEditHandler::ApplyVersionEdit(edit, cfd); + if (s.ok()) { + assert(cfd); + if (*cfd) { + cfds_changed_.insert(*cfd); + } + } + return s; +} + +Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit, + ColumnFamilyData** cfd) { + if (Mode::kRecovery == mode_) { + return VersionEditHandler::OnColumnFamilyAdd(edit, cfd); + } + assert(Mode::kCatchUp == mode_); + ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet(); + assert(cfd_set); + ColumnFamilyData* tmp_cfd = cfd_set->GetColumnFamily(edit.GetColumnFamily()); + assert(cfd); + *cfd = tmp_cfd; + if (!tmp_cfd) { + // For now, ignore new column families created after Recover() succeeds. 
+ return Status::OK(); + } + auto builder_iter = builders_.find(edit.GetColumnFamily()); + assert(builder_iter != builders_.end()); + + Version* dummy_version = tmp_cfd->dummy_versions(); + assert(dummy_version); + Version* base_version = dummy_version->Next(); + assert(base_version); + base_version->Ref(); + VersionBuilderUPtr new_builder( + new BaseReferencedVersionBuilder(tmp_cfd, base_version)); + builder_iter->second = std::move(new_builder); + +#ifndef NDEBUG + auto version_iter = versions_.find(edit.GetColumnFamily()); + assert(version_iter != versions_.end()); +#endif // !NDEBUG + return Status::OK(); +} + +void ManifestTailer::CheckIterationResult(const log::Reader& reader, + Status* s) { + VersionEditHandlerPointInTime::CheckIterationResult(reader, s); + assert(s); + if (s->ok()) { + if (Mode::kRecovery == mode_) { + mode_ = Mode::kCatchUp; + } else { + assert(Mode::kCatchUp == mode_); + } + } +} + +Status ManifestTailer::VerifyFile(const std::string& fpath, + const FileMetaData& fmeta) { + Status s = VersionEditHandlerPointInTime::VerifyFile(fpath, fmeta); + // TODO: Open file or create hard link to prevent the file from being + // deleted. + return s; +} + +void DumpManifestHandler::CheckIterationResult(const log::Reader& reader, + Status* s) { + VersionEditHandler::CheckIterationResult(reader, s); + if (!s->ok()) { + fprintf(stdout, "%s\n", s->ToString().c_str()); + return; + } + assert(cf_to_cmp_names_); + for (auto* cfd : *(version_set_->column_family_set_)) { + fprintf(stdout, + "--------------- Column family \"%s\" (ID %" PRIu32 + ") --------------\n", + cfd->GetName().c_str(), cfd->GetID()); + fprintf(stdout, "log number: %" PRIu64 "\n", cfd->GetLogNumber()); + auto it = cf_to_cmp_names_->find(cfd->GetID()); + if (it != cf_to_cmp_names_->end()) { + fprintf(stdout, + "comparator: <%s>, but the comparator object is not available.\n", + it->second.c_str()); + } else { + fprintf(stdout, "comparator: %s\n", cfd->user_comparator()->Name()); + } + assert(cfd->current()); + + // Print out DebugStrings. Can include non-terminating null characters. 
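The fwrite() call that follows (and the matching calls in DumpManifestHandler::ApplyVersionEdit in the header further down) is used because a printf-style "%s" would stop at the first embedded NUL byte in the DebugString output. A tiny standalone illustration; WriteAll is a hypothetical helper, not part of the patch:

#include <cstdio>
#include <string>

// Hypothetical helper: writes every byte of the string, including embedded '\0'.
void WriteAll(const std::string& s) {
  fwrite(s.data(), sizeof(char), s.size(), stdout);
}

int main() {
  const std::string s("a\0b", 3);  // three bytes, the middle one is NUL
  printf("%s\n", s.c_str());       // prints only "a"
  WriteAll(s);                     // writes all three bytes
  return 0;
}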
+ fwrite(cfd->current()->DebugString(hex_).data(), sizeof(char), + cfd->current()->DebugString(hex_).size(), stdout); + } + fprintf(stdout, + "next_file_number %" PRIu64 " last_sequence %" PRIu64 + " prev_log_number %" PRIu64 " max_column_family %" PRIu32 + " min_log_number_to_keep " + "%" PRIu64 "\n", + version_set_->current_next_file_number(), + version_set_->LastSequence(), version_set_->prev_log_number(), + version_set_->column_family_set_->GetMaxColumnFamily(), + version_set_->min_log_number_to_keep_2pc()); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h index da222a8f3db..665e0f0d43f 100644 --- a/db/version_edit_handler.h +++ b/db/version_edit_handler.h @@ -15,7 +15,79 @@ namespace ROCKSDB_NAMESPACE { -typedef std::unique_ptr VersionBuilderUPtr; +struct FileMetaData; + +class VersionEditHandlerBase { + public: + explicit VersionEditHandlerBase() + : max_manifest_read_size_(std::numeric_limits::max()) {} + + virtual ~VersionEditHandlerBase() {} + + void Iterate(log::Reader& reader, Status* log_read_status); + + const Status& status() const { return status_; } + + AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; } + + protected: + explicit VersionEditHandlerBase(uint64_t max_read_size) + : max_manifest_read_size_(max_read_size) {} + virtual Status Initialize() { return Status::OK(); } + + virtual Status ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** cfd) = 0; + + virtual void CheckIterationResult(const log::Reader& /*reader*/, + Status* /*s*/) {} + + void ClearReadBuffer() { read_buffer_.Clear(); } + + Status status_; + + private: + AtomicGroupReadBuffer read_buffer_; + const uint64_t max_manifest_read_size_; +}; + +class ListColumnFamiliesHandler : public VersionEditHandlerBase { + public: + ListColumnFamiliesHandler() : VersionEditHandlerBase() {} + + ~ListColumnFamiliesHandler() override {} + + const std::map GetColumnFamilyNames() const { + return column_family_names_; + } + + protected: + Status ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) override; + + private: + // default column family is always implicitly there + std::map column_family_names_{ + {0, kDefaultColumnFamilyName}}; +}; + +class FileChecksumRetriever : public VersionEditHandlerBase { + public: + FileChecksumRetriever(uint64_t max_read_size, + FileChecksumList& file_checksum_list) + : VersionEditHandlerBase(max_read_size), + file_checksum_list_(file_checksum_list) {} + + ~FileChecksumRetriever() override {} + + protected: + Status ApplyVersionEdit(VersionEdit& edit, + ColumnFamilyData** /*unused*/) override; + + private: + FileChecksumList& file_checksum_list_; +}; + +using VersionBuilderUPtr = std::unique_ptr; // A class used for scanning MANIFEST file. // VersionEditHandler reads a MANIFEST file, parses the version edits, and @@ -24,44 +96,64 @@ typedef std::unique_ptr VersionBuilderUPtr; // To use this class and its subclasses, // 1. Create an object of VersionEditHandler or its subclasses. // VersionEditHandler handler(read_only, column_families, version_set, -// track_missing_files, ignore_missing_files); +// track_missing_files, +// no_error_if_files_missing); // 2. Status s = handler.Iterate(reader, &db_id); // 3. Check s and handle possible errors. // // Not thread-safe, external synchronization is necessary if an object of // VersionEditHandler is shared by multiple threads. 
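For reference, the call pattern implied by the updated declarations (the two-argument Iterate() on the base class plus the status() and GetDbId() accessors) looks roughly like the sketch below. It relies on RocksDB-internal types (log::Reader, VersionSet, IOTracer) constructed elsewhere, so it is a sketch rather than compilable standalone code:

// Assumes manifest_reader, column_families, version_set and io_tracer were
// set up by the caller; the element type of column_families is as declared above.
VersionEditHandler handler(/*read_only=*/false, column_families, version_set,
                           /*track_missing_files=*/false,
                           /*no_error_if_files_missing=*/false, io_tracer);
Status log_read_status;  // I/O errors encountered while reading the MANIFEST
handler.Iterate(manifest_reader, &log_read_status);
const Status& s = handler.status();  // result of applying the decoded edits
std::string db_id;
handler.GetDbId(&db_id);  // filled in only if the MANIFEST recorded a DB id
// Both log_read_status and s need to be checked before using the recovered state.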
-class VersionEditHandler { +class VersionEditHandler : public VersionEditHandlerBase { public: explicit VersionEditHandler( bool read_only, const std::vector& column_families, VersionSet* version_set, bool track_missing_files, - bool ignore_missing_files, const std::shared_ptr& io_tracer); + bool no_error_if_files_missing, + const std::shared_ptr& io_tracer) + : VersionEditHandler(read_only, column_families, version_set, + track_missing_files, no_error_if_files_missing, + io_tracer, /*skip_load_table_files=*/false) {} - virtual ~VersionEditHandler() {} + ~VersionEditHandler() override {} - void Iterate(log::Reader& reader, Status* log_read_status, - std::string* db_id); - - const Status& status() const { return status_; } + const VersionEditParams& GetVersionEditParams() const { + return version_edit_params_; + } bool HasMissingFiles() const; + void GetDbId(std::string* db_id) const { + if (db_id && version_edit_params_.has_db_id_) { + *db_id = version_edit_params_.db_id_; + } + } + protected: - Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd); + explicit VersionEditHandler( + bool read_only, std::vector column_families, + VersionSet* version_set, bool track_missing_files, + bool no_error_if_files_missing, + const std::shared_ptr& io_tracer, bool skip_load_table_files); - Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd); + Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override; + + virtual Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd); Status OnColumnFamilyDrop(VersionEdit& edit, ColumnFamilyData** cfd); Status OnNonCfOperation(VersionEdit& edit, ColumnFamilyData** cfd); - Status Initialize(); + Status OnWalAddition(VersionEdit& edit); + + Status OnWalDeletion(VersionEdit& edit); + + Status Initialize() override; void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found, bool* cf_in_builders) const; - virtual void CheckIterationResult(const log::Reader& reader, Status* s); + void CheckIterationResult(const log::Reader& reader, Status* s) override; ColumnFamilyData* CreateCfAndInit(const ColumnFamilyOptions& cf_options, const VersionEdit& edit); @@ -76,26 +168,31 @@ class VersionEditHandler { bool prefetch_index_and_filter_in_cache, bool is_initial_load); + virtual bool MustOpenAllColumnFamilies() const { return !read_only_; } + const bool read_only_; - const std::vector& column_families_; - Status status_; + std::vector column_families_; VersionSet* version_set_; - AtomicGroupReadBuffer read_buffer_; std::unordered_map builders_; std::unordered_map name_to_options_; + // Keeps track of column families in manifest that were not found in + // column families parameters. if those column families are not dropped + // by subsequent manifest records, Recover() will return failure status. std::unordered_map column_families_not_found_; VersionEditParams version_edit_params_; const bool track_missing_files_; std::unordered_map> cf_to_missing_files_; - bool no_error_if_table_files_missing_; + std::unordered_map cf_to_missing_blob_files_high_; + bool no_error_if_files_missing_; + std::shared_ptr io_tracer_; + bool skip_load_table_files_; + bool initialized_; + std::unique_ptr> cf_to_cmp_names_; private: Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, const VersionEdit& edit); - - bool initialized_; - std::shared_ptr io_tracer_; }; // A class similar to its base class, i.e. VersionEditHandler. 
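VersionEditHandlerPointInTime, whose declaration follows, recovers to the latest point in the MANIFEST at which all referenced table and blob files are present. This is most likely the machinery behind the public best_efforts_recovery open option; a short, hedged example of that public entry point is given below (the path and option values are illustrative only):

#include <iostream>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  // Roll back to a consistent point in the MANIFEST instead of failing the
  // open when some SST/blob files referenced by newer edits are missing.
  options.best_efforts_recovery = true;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/rocksdb_best_efforts_example", &db);
  if (!s.ok()) {
    std::cerr << "Open failed: " << s.ToString() << std::endl;
    return 1;
  }
  // ... use db, possibly missing the most recent updates ...
  delete db;
  return 0;
}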
@@ -107,8 +204,7 @@ class VersionEditHandler { class VersionEditHandlerPointInTime : public VersionEditHandler { public: VersionEditHandlerPointInTime( - bool read_only, - const std::vector& column_families, + bool read_only, std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer); ~VersionEditHandlerPointInTime() override; @@ -117,10 +213,97 @@ class VersionEditHandlerPointInTime : public VersionEditHandler { ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override; Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) override; + virtual Status VerifyFile(const std::string& fpath, + const FileMetaData& fmeta); + virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num, + const BlobFileAddition& blob_addition); - private: std::unordered_map versions_; - std::shared_ptr io_tracer_; +}; + +class ManifestTailer : public VersionEditHandlerPointInTime { + public: + explicit ManifestTailer(std::vector column_families, + VersionSet* version_set, + const std::shared_ptr& io_tracer) + : VersionEditHandlerPointInTime(/*read_only=*/false, column_families, + version_set, io_tracer), + mode_(Mode::kRecovery) {} + + void PrepareToReadNewManifest() { + initialized_ = false; + ClearReadBuffer(); + } + + std::unordered_set& GetUpdatedColumnFamilies() { + return cfds_changed_; + } + + protected: + Status Initialize() override; + + bool MustOpenAllColumnFamilies() const override { return false; } + + Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override; + + Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd) override; + + void CheckIterationResult(const log::Reader& reader, Status* s) override; + + Status VerifyFile(const std::string& fpath, + const FileMetaData& fmeta) override; + + enum Mode : uint8_t { + kRecovery = 0, + kCatchUp = 1, + }; + + Mode mode_; + std::unordered_set cfds_changed_; +}; + +class DumpManifestHandler : public VersionEditHandler { + public: + DumpManifestHandler(std::vector column_families, + VersionSet* version_set, + const std::shared_ptr& io_tracer, bool verbose, + bool hex, bool json) + : VersionEditHandler( + /*read_only=*/true, column_families, version_set, + /*track_missing_files=*/false, + /*no_error_if_files_missing=*/false, io_tracer, + /*skip_load_table_files=*/true), + verbose_(verbose), + hex_(hex), + json_(json), + count_(0) { + cf_to_cmp_names_.reset(new std::unordered_map()); + } + + ~DumpManifestHandler() override {} + + Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override { + // Write out each individual edit + if (verbose_ && !json_) { + // Print out DebugStrings. Can include non-terminating null characters. + fwrite(edit.DebugString(hex_).data(), sizeof(char), + edit.DebugString(hex_).size(), stdout); + } else if (json_) { + // Print out DebugStrings. Can include non-terminating null characters. + fwrite(edit.DebugString(hex_).data(), sizeof(char), + edit.DebugString(hex_).size(), stdout); + } + ++count_; + return VersionEditHandler::ApplyVersionEdit(edit, cfd); + } + + void CheckIterationResult(const log::Reader& reader, Status* s) override; + + private: + const bool verbose_; + const bool hex_; + const bool json_; + int count_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index ea62d9a784f..43ae6840fb1 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -8,8 +8,10 @@ // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #include "db/version_edit.h" + #include "test_util/sync_point.h" #include "test_util/testharness.h" +#include "test_util/testutil.h" #include "util/coding.h" #include "util/string_util.h" @@ -317,23 +319,27 @@ TEST_F(VersionEditTest, AddWalEncodeDecode) { if (has_size) { meta.SetSyncedSizeInBytes(rand() % 1000); } - bool is_closed = rand() % 2 == 0; - if (is_closed) { - meta.SetClosed(); - } edit.AddWal(log_number, meta); } TestEncodeDecode(edit); } +static std::string PrefixEncodedWalAdditionWithLength( + const std::string& encoded) { + std::string ret; + PutVarint32(&ret, Tag::kWalAddition2); + PutLengthPrefixedSlice(&ret, encoded); + return ret; +} + TEST_F(VersionEditTest, AddWalDecodeBadLogNumber) { std::string encoded; - PutVarint32(&encoded, Tag::kWalAddition); { // No log number. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") != std::string::npos) @@ -347,8 +353,10 @@ TEST_F(VersionEditTest, AddWalDecodeBadLogNumber) { unsigned char* ptr = reinterpret_cast(&c); *ptr = 128; encoded.append(1, c); + + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") != std::string::npos) @@ -360,14 +368,14 @@ TEST_F(VersionEditTest, AddWalDecodeBadTag) { constexpr WalNumber kLogNumber = 100; constexpr uint64_t kSizeInBytes = 100; - std::string encoded_without_tag; - PutVarint32(&encoded_without_tag, Tag::kWalAddition); - PutVarint64(&encoded_without_tag, kLogNumber); + std::string encoded; + PutVarint64(&encoded, kLogNumber); { // No tag. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded_without_tag); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) << s.ToString(); @@ -375,12 +383,15 @@ TEST_F(VersionEditTest, AddWalDecodeBadTag) { { // Only has size tag, no terminate tag. - std::string encoded_with_size = encoded_without_tag; + std::string encoded_with_size = encoded; PutVarint32(&encoded_with_size, static_cast(WalAdditionTag::kSyncedSize)); PutVarint64(&encoded_with_size, kSizeInBytes); + + std::string encoded_edit = + PrefixEncodedWalAdditionWithLength(encoded_with_size); VersionEdit edit; - Status s = edit.DecodeFrom(encoded_with_size); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) << s.ToString(); @@ -388,11 +399,14 @@ TEST_F(VersionEditTest, AddWalDecodeBadTag) { { // Only has terminate tag. 
- std::string encoded_with_terminate = encoded_without_tag; + std::string encoded_with_terminate = encoded; PutVarint32(&encoded_with_terminate, static_cast(WalAdditionTag::kTerminate)); + + std::string encoded_edit = + PrefixEncodedWalAdditionWithLength(encoded_with_terminate); VersionEdit edit; - ASSERT_OK(edit.DecodeFrom(encoded_with_terminate)); + ASSERT_OK(edit.DecodeFrom(encoded_edit)); auto& wal_addition = edit.GetWalAdditions()[0]; ASSERT_EQ(wal_addition.GetLogNumber(), kLogNumber); ASSERT_FALSE(wal_addition.GetMetadata().HasSyncedSize()); @@ -403,15 +417,15 @@ TEST_F(VersionEditTest, AddWalDecodeNoSize) { constexpr WalNumber kLogNumber = 100; std::string encoded; - PutVarint32(&encoded, Tag::kWalAddition); PutVarint64(&encoded, kLogNumber); PutVarint32(&encoded, static_cast(WalAdditionTag::kSyncedSize)); // No real size after the size tag. { // Without terminate tag. + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.ToString().find("Error decoding WAL file size") != std::string::npos) @@ -421,8 +435,10 @@ TEST_F(VersionEditTest, AddWalDecodeNoSize) { { // With terminate tag. PutVarint32(&encoded, static_cast(WalAdditionTag::kTerminate)); + + std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded); VersionEdit edit; - Status s = edit.DecodeFrom(encoded); + Status s = edit.DecodeFrom(encoded_edit); ASSERT_TRUE(s.IsCorruption()); // The terminate tag is misunderstood as the size. ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos) @@ -442,7 +458,7 @@ TEST_F(VersionEditTest, AddWalDebug) { const WalAdditions& wals = edit.GetWalAdditions(); - ASSERT_TRUE(edit.HasWalAddition()); + ASSERT_TRUE(edit.IsWalAddition()); ASSERT_EQ(wals.size(), n); for (int i = 0; i < n; i++) { const WalAddition& wal = wals[i]; @@ -454,7 +470,7 @@ TEST_F(VersionEditTest, AddWalDebug) { for (int i = 0; i < n; i++) { std::stringstream ss; ss << " WalAddition: log_number: " << kLogNumbers[i] - << " synced_size_in_bytes: " << kSizeInBytes[i] << " closed: 0\n"; + << " synced_size_in_bytes: " << kSizeInBytes[i] << "\n"; expected_str += ss.str(); } expected_str += " ColumnFamily: 0\n}\n"; @@ -464,8 +480,7 @@ TEST_F(VersionEditTest, AddWalDebug) { for (int i = 0; i < n; i++) { std::stringstream ss; ss << "{\"LogNumber\": " << kLogNumbers[i] << ", " - << "\"SyncedSizeInBytes\": " << kSizeInBytes[i] << ", " - << "\"Closed\": 0}"; + << "\"SyncedSizeInBytes\": " << kSizeInBytes[i] << "}"; if (i < n - 1) ss << ", "; expected_json += ss.str(); } @@ -475,9 +490,7 @@ TEST_F(VersionEditTest, AddWalDebug) { TEST_F(VersionEditTest, DeleteWalEncodeDecode) { VersionEdit edit; - for (uint64_t log_number = 1; log_number <= 20; log_number++) { - edit.DeleteWal(log_number); - } + edit.DeleteWalsBefore(rand() % 100); TestEncodeDecode(edit); } @@ -486,39 +499,96 @@ TEST_F(VersionEditTest, DeleteWalDebug) { constexpr std::array kLogNumbers{{10, 20}}; VersionEdit edit; - for (int i = 0; i < n; i++) { - edit.DeleteWal(kLogNumbers[i]); - } + edit.DeleteWalsBefore(kLogNumbers[n - 1]); - const WalDeletions& wals = edit.GetWalDeletions(); + const WalDeletion& wal = edit.GetWalDeletion(); - ASSERT_TRUE(edit.HasWalDeletion()); - ASSERT_EQ(wals.size(), n); - for (int i = 0; i < n; i++) { - const WalDeletion& wal = wals[i]; - ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[i]); - } + ASSERT_TRUE(edit.IsWalDeletion()); + ASSERT_EQ(wal.GetLogNumber(), 
kLogNumbers[n - 1]); std::string expected_str = "VersionEdit {\n"; - for (int i = 0; i < n; i++) { + { std::stringstream ss; - ss << " WalDeletion: log_number: " << kLogNumbers[i] << "\n"; + ss << " WalDeletion: log_number: " << kLogNumbers[n - 1] << "\n"; expected_str += ss.str(); } expected_str += " ColumnFamily: 0\n}\n"; ASSERT_EQ(edit.DebugString(true), expected_str); - std::string expected_json = "{\"EditNumber\": 4, \"WalDeletions\": ["; - for (int i = 0; i < n; i++) { + std::string expected_json = "{\"EditNumber\": 4, \"WalDeletion\": "; + { std::stringstream ss; - ss << "{\"LogNumber\": " << kLogNumbers[i] << "}"; - if (i < n - 1) ss << ", "; + ss << "{\"LogNumber\": " << kLogNumbers[n - 1] << "}"; expected_json += ss.str(); } - expected_json += "], \"ColumnFamily\": 0}"; + expected_json += ", \"ColumnFamily\": 0}"; ASSERT_EQ(edit.DebugJSON(4, true), expected_json); } +TEST_F(VersionEditTest, FullHistoryTsLow) { + VersionEdit edit; + ASSERT_FALSE(edit.HasFullHistoryTsLow()); + std::string ts = test::EncodeInt(0); + edit.SetFullHistoryTsLow(ts); + TestEncodeDecode(edit); +} + +// Tests that if RocksDB is downgraded, the new types of VersionEdits +// that have a tag larger than kTagSafeIgnoreMask can be safely ignored. +TEST_F(VersionEditTest, IgnorableTags) { + SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:IgnoreIgnorableTags", [&](void* arg) { + bool* ignore = static_cast(arg); + *ignore = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + constexpr uint64_t kPrevLogNumber = 100; + constexpr uint64_t kLogNumber = 200; + constexpr uint64_t kNextFileNumber = 300; + constexpr uint64_t kColumnFamilyId = 400; + + VersionEdit edit; + // Add some ignorable entries. + for (int i = 0; i < 2; i++) { + edit.AddWal(i + 1, WalMetadata(i + 2)); + } + edit.SetDBId("db_id"); + // Add unignorable entries. + edit.SetPrevLogNumber(kPrevLogNumber); + edit.SetLogNumber(kLogNumber); + // Add more ignorable entries. + edit.DeleteWalsBefore(100); + // Add unignorable entry. + edit.SetNextFile(kNextFileNumber); + // Add more ignorable entries. + edit.SetFullHistoryTsLow("ts"); + // Add unignorable entry. + edit.SetColumnFamily(kColumnFamilyId); + + std::string encoded; + ASSERT_TRUE(edit.EncodeTo(&encoded)); + + VersionEdit decoded; + ASSERT_OK(decoded.DecodeFrom(encoded)); + + // Check that all ignorable entries are ignored. + ASSERT_FALSE(decoded.HasDbId()); + ASSERT_FALSE(decoded.HasFullHistoryTsLow()); + ASSERT_FALSE(decoded.IsWalAddition()); + ASSERT_FALSE(decoded.IsWalDeletion()); + ASSERT_TRUE(decoded.GetWalAdditions().empty()); + ASSERT_TRUE(decoded.GetWalDeletion().IsEmpty()); + + // Check that unignorable entries are still present. 
+ ASSERT_EQ(edit.GetPrevLogNumber(), kPrevLogNumber); + ASSERT_EQ(edit.GetLogNumber(), kLogNumber); + ASSERT_EQ(edit.GetNextFile(), kNextFileNumber); + ASSERT_EQ(edit.GetColumnFamily(), kColumnFamilyId); + + SyncPoint::GetInstance()->DisableProcessing(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/version_set.cc b/db/version_set.cc index b6550dd60c7..e1964482143 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,11 +9,10 @@ #include "db/version_set.h" -#include - #include #include #include +#include #include #include #include @@ -22,6 +21,10 @@ #include #include "compaction/compaction.h" +#include "db/blob/blob_fetcher.h" +#include "db/blob/blob_file_cache.h" +#include "db/blob/blob_file_reader.h" +#include "db/blob/blob_index.h" #include "db/internal_stats.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -39,6 +42,7 @@ #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "monitoring/persistent_stats_history.h" +#include "options/options_helper.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/write_buffer_manager.h" @@ -92,7 +96,8 @@ Status OverlapWithIterator(const Comparator* ucmp, *overlap = false; if (iter->Valid()) { ParsedInternalKey seek_result; - Status s = ParseInternalKey(iter->key(), &seek_result); + Status s = ParseInternalKey(iter->key(), &seek_result, + false /* log_err_key */); // TODO if (!s.ok()) return s; if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <= @@ -404,7 +409,7 @@ class FilePickerMultiGet { int GetCurrentLevel() const { return curr_level_; } // Iterates through files in the current level until it finds a file that - // contains atleast one key from the MultiGet batch + // contains at least one key from the MultiGet batch bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range, size_t* file_index, FdWithKeyRange** fd, bool* is_last_key_in_file) { @@ -436,7 +441,7 @@ class FilePickerMultiGet { !file_hit)) { struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()]; f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level]; - Slice& user_key = batch_iter_->ukey; + Slice& user_key = batch_iter_->ukey_without_ts; // Do key range filtering of files or/and fractional cascading if: // (1) not all the files are in level 0, or @@ -450,17 +455,17 @@ class FilePickerMultiGet { // Check if key is within a file's range. If search left bound and // right bound point to the same find, we are sure key falls in // range. 
+ int cmp_smallest = user_comparator_->CompareWithoutTimestamp( + user_key, false, ExtractUserKey(f->smallest_key), true); + assert(curr_level_ == 0 || fp_ctx.curr_index_in_curr_level == fp_ctx.start_index_in_curr_level || - user_comparator_->Compare(user_key, - ExtractUserKey(f->smallest_key)) <= 0); + cmp_smallest <= 0); - int cmp_smallest = user_comparator_->Compare( - user_key, ExtractUserKey(f->smallest_key)); if (cmp_smallest >= 0) { - cmp_largest = user_comparator_->Compare( - user_key, ExtractUserKey(f->largest_key)); + cmp_largest = user_comparator_->CompareWithoutTimestamp( + user_key, false, ExtractUserKey(f->largest_key), true); } else { cmp_largest = -1; } @@ -493,8 +498,9 @@ class FilePickerMultiGet { upper_key_ = batch_iter_; ++upper_key_; while (upper_key_ != current_level_range_.end() && - user_comparator_->Compare(batch_iter_->ukey, upper_key_->ukey) == - 0) { + user_comparator_->CompareWithoutTimestamp( + batch_iter_->ukey_without_ts, false, + upper_key_->ukey_without_ts, false) == 0) { ++upper_key_; } break; @@ -1091,13 +1097,17 @@ void LevelIterator::Seek(const Slice& target) { // next key after the prefix, or make the iterator invalid. // A side benefit will be that it invalidates the iterator earlier so that // the upper level merging iterator can merge fewer child iterators. - Slice target_user_key = ExtractUserKey(target); - Slice file_user_key = ExtractUserKey(file_iter_.key()); - if (prefix_extractor_->InDomain(target_user_key) && - (!prefix_extractor_->InDomain(file_user_key) || - user_comparator_.Compare( - prefix_extractor_->Transform(target_user_key), - prefix_extractor_->Transform(file_user_key)) != 0)) { + size_t ts_sz = user_comparator_.timestamp_size(); + Slice target_user_key_without_ts = + ExtractUserKeyAndStripTimestamp(target, ts_sz); + Slice file_user_key_without_ts = + ExtractUserKeyAndStripTimestamp(file_iter_.key(), ts_sz); + if (prefix_extractor_->InDomain(target_user_key_without_ts) && + (!prefix_extractor_->InDomain(file_user_key_without_ts) || + user_comparator_.CompareWithoutTimestamp( + prefix_extractor_->Transform(target_user_key_without_ts), false, + prefix_extractor_->Transform(file_user_key_without_ts), + false) != 0)) { SetFileIterator(nullptr); } } @@ -1293,7 +1303,7 @@ Status Version::GetTableProperties(std::shared_ptr* tp, if (!s.ok()) { return s; } - RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); + RecordTick(ioptions->stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); *tp = std::shared_ptr(raw_table_properties); return s; @@ -1476,15 +1486,16 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { file_path = ioptions->cf_paths.back().path; } const uint64_t file_number = file->fd.GetNumber(); - files.emplace_back(SstFileMetaData{ + files.emplace_back( MakeTableFileName("", file_number), file_number, file_path, static_cast(file->fd.GetFileSize()), file->fd.smallest_seqno, file->fd.largest_seqno, file->smallest.user_key().ToString(), file->largest.user_key().ToString(), file->stats.num_reads_sampled.load(std::memory_order_relaxed), - file->being_compacted, file->oldest_blob_file_number, - file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime(), - file->file_checksum, file->file_checksum_func_name}); + file->being_compacted, file->temperature, + file->oldest_blob_file_number, file->TryGetOldestAncesterTime(), + file->TryGetFileCreationTime(), file->file_checksum, + file->file_checksum_func_name); files.back().num_entries = file->num_entries; files.back().num_deletions = file->num_deletions; 
level_size += file->fd.GetFileSize(); @@ -1752,13 +1763,14 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, const std::shared_ptr& io_tracer, uint64_t version_number) : env_(vset->env_), + clock_(vset->clock_), cfd_(column_family_data), - info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log), - db_statistics_((cfd_ == nullptr) ? nullptr - : cfd_->ioptions()->statistics), + info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->logger), + db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->stats), table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()), - merge_operator_((cfd_ == nullptr) ? nullptr - : cfd_->ioptions()->merge_operator), + blob_file_cache_(cfd_ ? cfd_->blob_file_cache() : nullptr), + merge_operator_( + (cfd_ == nullptr) ? nullptr : cfd_->ioptions()->merge_operator.get()), storage_info_( (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(), (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(), @@ -1780,6 +1792,62 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, version_number_(version_number), io_tracer_(io_tracer) {} +Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key, + const Slice& blob_index_slice, PinnableSlice* value, + uint64_t* bytes_read) const { + if (read_options.read_tier == kBlockCacheTier) { + return Status::Incomplete("Cannot read blob: no disk I/O allowed"); + } + + BlobIndex blob_index; + + { + Status s = blob_index.DecodeFrom(blob_index_slice); + if (!s.ok()) { + return s; + } + } + + return GetBlob(read_options, user_key, blob_index, value, bytes_read); +} + +Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key, + const BlobIndex& blob_index, PinnableSlice* value, + uint64_t* bytes_read) const { + assert(value); + + if (blob_index.HasTTL() || blob_index.IsInlined()) { + return Status::Corruption("Unexpected TTL/inlined blob index"); + } + + const auto& blob_files = storage_info_.GetBlobFiles(); + + const uint64_t blob_file_number = blob_index.file_number(); + + const auto it = blob_files.find(blob_file_number); + if (it == blob_files.end()) { + return Status::Corruption("Invalid blob file number"); + } + + CacheHandleGuard blob_file_reader; + + { + assert(blob_file_cache_); + const Status s = blob_file_cache_->GetBlobFileReader(blob_file_number, + &blob_file_reader); + if (!s.ok()) { + return s; + } + } + + assert(blob_file_reader.GetValue()); + const Status s = blob_file_reader.GetValue()->GetBlob( + read_options, user_key, blob_index.offset(), blob_index.size(), + blob_index.compression(), value, bytes_read); + + return s; +} + void Version::Get(const ReadOptions& read_options, const LookupKey& k, PinnableSlice* value, std::string* timestamp, Status* status, MergeContext* merge_context, @@ -1802,13 +1870,21 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, vset_->block_cache_tracer_->is_tracing_enabled()) { tracing_get_id = vset_->block_cache_tracer_->NextGetId(); } + + // Note: the old StackableDB-based BlobDB passes in + // GetImplOptions::is_blob_index; for the integrated BlobDB implementation, we + // need to provide it here. + bool is_blob_index = false; + bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index; + BlobFetcher blob_fetcher(this, read_options); + GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, do_merge ? value : nullptr, do_merge ? 
timestamp : nullptr, value_found, - merge_context, do_merge, max_covering_tombstone_seq, this->env_, seq, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, - tracing_get_id); + merge_context, do_merge, max_covering_tombstone_seq, clock_, seq, + merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob_to_use, + tracing_get_id, &blob_fetcher); // Pin blocks that we read to hold merge operands if (merge_operator_) { @@ -1834,7 +1910,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && get_perf_context()->per_level_perf_context_enabled; - StopWatchNano timer(env_, timer_enabled /* auto_start */); + StopWatchNano timer(clock_, timer_enabled /* auto_start */); *status = table_cache_->Get( read_options, *internal_comparator(), *f->file_metadata, ikey, &get_context, mutable_cf_options_.prefix_extractor.get(), @@ -1872,8 +1948,25 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); + + if (is_blob_index) { + if (do_merge && value) { + constexpr uint64_t* bytes_read = nullptr; + + *status = + GetBlob(read_options, user_key, *value, value, bytes_read); + if (!status->ok()) { + if (status->IsIncomplete()) { + get_context.MarkKeyMayExist(); + } + return; + } + } + } + return; case GetContext::kDeleted: // Use empty error message for speed @@ -1882,7 +1975,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, case GetContext::kCorrupt: *status = Status::Corruption("corrupted key for ", user_key); return; - case GetContext::kBlobIndex: + case GetContext::kUnexpectedBlobIndex: ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); *status = Status::NotSupported( "Encounter unexpected blob index. Please open DB with " @@ -1909,7 +2002,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, std::string* str_value = value != nullptr ? value->GetSelf() : nullptr; *status = MergeHelper::TimedFullMerge( merge_operator_, user_key, nullptr, merge_context->GetOperands(), - str_value, info_log_, db_statistics_, env_, + str_value, info_log_, db_statistics_, clock_, nullptr /* result_operand */, true); if (LIKELY(value != nullptr)) { value->PinSelf(); @@ -1923,7 +2016,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, - ReadCallback* callback, bool* is_blob) { + ReadCallback* callback) { PinnedIteratorsManager pinned_iters_mgr; // Pin blocks that we read to hold merge operands @@ -1940,15 +2033,16 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, // use autovector in order to avoid unnecessary construction of GetContext // objects, which is expensive autovector get_ctx; + BlobFetcher blob_fetcher(this, read_options); for (auto iter = range->begin(); iter != range->end(); ++iter) { assert(iter->s->ok() || iter->s->IsMergeInProgress()); get_ctx.emplace_back( user_comparator(), merge_operator_, info_log_, db_statistics_, - iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge, iter->ukey, - iter->value, iter->timestamp, nullptr, &(iter->merge_context), true, - &iter->max_covering_tombstone_seq, this->env_, nullptr, - merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob, - tracing_mget_id); + iter->s->ok() ? 
GetContext::kNotFound : GetContext::kMerge, + iter->ukey_with_ts, iter->value, iter->timestamp, nullptr, + &(iter->merge_context), true, &iter->max_covering_tombstone_seq, clock_, + nullptr, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, + &iter->is_blob_index, tracing_mget_id, &blob_fetcher); // MergeInProgress status, if set, has been transferred to the get_context // state, so we set status to ok here. From now on, the iter status will // be used for IO errors, and get_context state will be used for any @@ -1978,7 +2072,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && get_perf_context()->per_level_perf_context_enabled; - StopWatchNano timer(env_, timer_enabled /* auto_start */); + StopWatchNano timer(clock_, timer_enabled /* auto_start */); s = table_cache_->MultiGet( read_options, *internal_comparator(), *f->file_metadata, &file_range, mutable_cf_options_.prefix_extractor.get(), @@ -2050,10 +2144,29 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); - file_range.AddValueSize(iter->value->size()); + file_range.MarkKeyDone(iter); + + if (iter->is_blob_index) { + if (iter->value) { + constexpr uint64_t* bytes_read = nullptr; + + *status = GetBlob(read_options, iter->ukey_with_ts, *iter->value, + iter->value, bytes_read); + if (!status->ok()) { + if (status->IsIncomplete()) { + get_context.MarkKeyMayExist(); + } + + continue; + } + } + } + + file_range.AddValueSize(iter->value->size()); if (file_range.GetValueSize() > read_options.value_size_soft_limit) { s = Status::Aborted(); break; @@ -2069,7 +2182,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, Status::Corruption("corrupted key for ", iter->lkey->user_key()); file_range.MarkKeyDone(iter); continue; - case GetContext::kBlobIndex: + case GetContext::kUnexpectedBlobIndex: ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); *status = Status::NotSupported( "Encounter unexpected blob index. Please open DB with " @@ -2124,7 +2237,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, iter->value != nullptr ? 
iter->value->GetSelf() : nullptr; *status = MergeHelper::TimedFullMerge( merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(), - str_value, info_log_, db_statistics_, env_, + str_value, info_log_, db_statistics_, clock_, nullptr /* result_operand */, true); if (LIKELY(iter->value != nullptr)) { iter->value->PinSelf(); @@ -2417,13 +2530,13 @@ void VersionStorageInfo::EstimateCompactionBytesNeeded( } namespace { -uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, +uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const std::vector& files) { uint32_t ttl_expired_files_count = 0; int64_t _current_time; - auto status = ioptions.env->GetCurrentTime(&_current_time); + auto status = ioptions.clock->GetCurrentTime(&_current_time); if (status.ok()) { const uint64_t current_time = static_cast(_current_time); for (FileMetaData* f : files) { @@ -2441,7 +2554,7 @@ uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, } // anonymous namespace void VersionStorageInfo::ComputeCompactionScore( - const ImmutableCFOptions& immutable_cf_options, + const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options) { for (int level = 0; level <= MaxInputLevel(); level++) { double score; @@ -2493,7 +2606,7 @@ void VersionStorageInfo::ComputeCompactionScore( if (mutable_cf_options.ttl > 0) { score = std::max( static_cast(GetExpiredTtlFilesCount( - immutable_cf_options, mutable_cf_options, files_[level])), + immutable_options, mutable_cf_options, files_[level])), score); } @@ -2505,7 +2618,7 @@ void VersionStorageInfo::ComputeCompactionScore( // L0 files. Take into account size as well to avoid later giant // compactions to the base level. uint64_t l0_target_size = mutable_cf_options.max_bytes_for_level_base; - if (immutable_cf_options.level_compaction_dynamic_level_bytes && + if (immutable_options.level_compaction_dynamic_level_bytes && level_multiplier_ != 0.0) { // Prevent L0 to Lbase fanout from growing larger than // `level_multiplier_`. 
This prevents us from getting stuck picking @@ -2553,11 +2666,11 @@ void VersionStorageInfo::ComputeCompactionScore( ComputeFilesMarkedForCompaction(); ComputeBottommostFilesMarkedForCompaction(); if (mutable_cf_options.ttl > 0) { - ComputeExpiredTtlFiles(immutable_cf_options, mutable_cf_options.ttl); + ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl); } if (mutable_cf_options.periodic_compaction_seconds > 0) { ComputeFilesMarkedForPeriodicCompaction( - immutable_cf_options, mutable_cf_options.periodic_compaction_seconds); + immutable_options, mutable_cf_options.periodic_compaction_seconds); } EstimateCompactionBytesNeeded(mutable_cf_options); } @@ -2586,13 +2699,13 @@ void VersionStorageInfo::ComputeFilesMarkedForCompaction() { } void VersionStorageInfo::ComputeExpiredTtlFiles( - const ImmutableCFOptions& ioptions, const uint64_t ttl) { + const ImmutableOptions& ioptions, const uint64_t ttl) { assert(ttl > 0); expired_ttl_files_.clear(); int64_t _current_time; - auto status = ioptions.env->GetCurrentTime(&_current_time); + auto status = ioptions.clock->GetCurrentTime(&_current_time); if (!status.ok()) { return; } @@ -2612,14 +2725,14 @@ void VersionStorageInfo::ComputeExpiredTtlFiles( } void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const uint64_t periodic_compaction_seconds) { assert(periodic_compaction_seconds > 0); files_marked_for_periodic_compaction_.clear(); int64_t temp_current_time; - auto status = ioptions.env->GetCurrentTime(&temp_current_time); + auto status = ioptions.clock->GetCurrentTime(&temp_current_time); if (!status.ok()) { return; } @@ -2653,7 +2766,7 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( status = ioptions.env->GetFileModificationTime( file_path, &file_modification_time); if (!status.ok()) { - ROCKS_LOG_WARN(ioptions.info_log, + ROCKS_LOG_WARN(ioptions.logger, "Can't get file modification time: %s: %s", file_path.c_str(), status.ToString().c_str()); continue; @@ -2676,7 +2789,7 @@ struct Fsize { FileMetaData* file; }; -// Compator that is used to sort files based on their size +// Comparator that is used to sort files based on their size // In normal mode: descending size bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { return (first.file->compensated_file_size > @@ -3096,7 +3209,7 @@ void VersionStorageInfo::GetCleanInputsWithinInterval( // specified range. From that file, iterate backwards and // forwards to find all overlapping files. // if within_range is set, then only store the maximum clean inputs -// within range [begin, end]. "clean" means there is a boudnary +// within range [begin, end]. "clean" means there is a boundary // between the files in "*inputs" and the surrounding files void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( int level, const InternalKey* begin, const InternalKey* end, @@ -3286,7 +3399,7 @@ uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const { return level_max_bytes_[level]; } -void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, +void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, const MutableCFOptions& options) { // Special logic to set number of sorted runs. // It is to match the previous behavior when all files are in L0. @@ -3376,7 +3489,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, // base_bytes_min. We set it be base_bytes_min. 
base_level_size = base_bytes_min + 1U; base_level_ = first_non_empty_level; - ROCKS_LOG_INFO(ioptions.info_log, + ROCKS_LOG_INFO(ioptions.logger, "More existing levels in DB than needed. " "max_bytes_for_level_multiplier may not be guaranteed."); } else { @@ -3407,7 +3520,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, // 1. the L0 size is larger than level size base, or // 2. number of L0 files reaches twice the L0->L1 compaction trigger // We don't do this otherwise to keep the LSM-tree structure stable - // unless the L0 compation is backlogged. + // unless the L0 compaction is backlogged. base_level_size = l0_size; if (base_level_ == num_levels_ - 1) { level_multiplier_ = 1.0; @@ -3593,16 +3706,30 @@ struct VersionSet::ManifestWriter { ColumnFamilyData* cfd; const MutableCFOptions mutable_cf_options; const autovector& edit_list; + const std::function manifest_write_callback; - explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd, - const MutableCFOptions& cf_options, - const autovector& e) + explicit ManifestWriter( + InstrumentedMutex* mu, ColumnFamilyData* _cfd, + const MutableCFOptions& cf_options, const autovector& e, + const std::function& manifest_wcb) : done(false), cv(mu), cfd(_cfd), mutable_cf_options(cf_options), - edit_list(e) {} + edit_list(e), + manifest_write_callback(manifest_wcb) {} ~ManifestWriter() { status.PermitUncheckedError(); } + + bool IsAllWalEdits() const { + bool all_wal_edits = true; + for (const auto& e : edit_list) { + if (!e->IsWalManipulation()) { + all_wal_edits = false; + break; + } + } + return all_wal_edits; + } }; Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { @@ -3656,13 +3783,16 @@ VersionSet::VersionSet(const std::string& dbname, WriteBufferManager* write_buffer_manager, WriteController* write_controller, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& db_session_id) : column_family_set_( new ColumnFamilySet(dbname, _db_options, storage_options, table_cache, write_buffer_manager, write_controller, - block_cache_tracer, io_tracer)), + block_cache_tracer, io_tracer, db_session_id)), + table_cache_(table_cache), env_(_db_options->env), fs_(_db_options->fs, io_tracer), + clock_(_db_options->clock), dbname_(dbname), db_options_(_db_options), next_file_number_(2), @@ -3677,17 +3807,17 @@ VersionSet::VersionSet(const std::string& dbname, manifest_file_size_(0), file_options_(storage_options), block_cache_tracer_(block_cache_tracer), - io_tracer_(io_tracer) {} + io_tracer_(io_tracer), + db_session_id_(db_session_id) {} VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on // VersionSet - Cache* table_cache = column_family_set_->get_table_cache(); column_family_set_.reset(); for (auto& file : obsolete_files_) { if (file.metadata->table_reader_handle) { - table_cache->Release(file.metadata->table_reader_handle); - TableCache::Evict(table_cache, file.metadata->fd.GetNumber()); + table_cache_->Release(file.metadata->table_reader_handle); + TableCache::Evict(table_cache_, file.metadata->fd.GetNumber()); } file.DeleteMetadata(); } @@ -3697,12 +3827,11 @@ VersionSet::~VersionSet() { void VersionSet::Reset() { if (column_family_set_) { - Cache* table_cache = column_family_set_->get_table_cache(); WriteBufferManager* wbm = column_family_set_->write_buffer_manager(); WriteController* wc = column_family_set_->write_controller(); - column_family_set_.reset( - new 
ColumnFamilySet(dbname_, db_options_, file_options_, table_cache, - wbm, wc, block_cache_tracer_, io_tracer_)); + column_family_set_.reset(new ColumnFamilySet( + dbname_, db_options_, file_options_, table_cache_, wbm, wc, + block_cache_tracer_, io_tracer_, db_session_id_)); } db_id_.clear(); next_file_number_.store(2); @@ -3755,6 +3884,7 @@ Status VersionSet::ProcessManifestWrites( std::deque& writers, InstrumentedMutex* mu, FSDirectory* db_directory, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options) { + mu->AssertHeld(); assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); ManifestWriter* last_writer = &first_writer; @@ -3831,16 +3961,22 @@ Status VersionSet::ProcessManifestWrites( } } if (version == nullptr) { - version = new Version(last_writer->cfd, this, file_options_, - last_writer->mutable_cf_options, io_tracer_, - current_version_number_++); - versions.push_back(version); - mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options); - builder_guards.emplace_back( - new BaseReferencedVersionBuilder(last_writer->cfd)); - builder = builder_guards.back()->version_builder(); + // WAL manipulations do not need to be applied to versions. + if (!last_writer->IsAllWalEdits()) { + version = new Version(last_writer->cfd, this, file_options_, + last_writer->mutable_cf_options, io_tracer_, + current_version_number_++); + versions.push_back(version); + mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options); + builder_guards.emplace_back( + new BaseReferencedVersionBuilder(last_writer->cfd)); + builder = builder_guards.back()->version_builder(); + } + assert(last_writer->IsAllWalEdits() || builder); + assert(last_writer->IsAllWalEdits() || version); + TEST_SYNC_POINT_CALLBACK("VersionSet::ProcessManifestWrites:NewVersion", + version); } - assert(builder != nullptr); // make checker happy for (const auto& e : last_writer->edit_list) { if (e->is_in_atomic_group_) { if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ || @@ -3926,6 +4062,7 @@ Status VersionSet::ProcessManifestWrites( // reads its content after releasing db mutex to avoid race with // SwitchMemtable(). 
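The snapshot loop in the lines that follow stores per-column-family state as MutableCFState(log_number, full_history_ts_low). The struct definition itself is not visible in these hunks; its assumed shape, inferred from how it is constructed and read back, is:

// Assumed definition (declared in version_set.h, not shown in this part of
// the patch); fields match the two constructor arguments used below.
struct MutableCFState {
  uint64_t log_number = 0;
  std::string full_history_ts_low;
  MutableCFState(uint64_t _log_number, std::string ts_low)
      : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {}
};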
std::unordered_map curr_state; + VersionEdit wal_additions; if (new_descriptor_log) { pending_manifest_file_number_ = NewFileNumber(); batch_edits.back()->SetNextFile(next_file_number_.load()); @@ -3938,18 +4075,25 @@ Status VersionSet::ProcessManifestWrites( } for (const auto* cfd : *column_family_set_) { assert(curr_state.find(cfd->GetID()) == curr_state.end()); - curr_state[cfd->GetID()] = {cfd->GetLogNumber()}; + curr_state.emplace(std::make_pair( + cfd->GetID(), + MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow()))); + } + + for (const auto& wal : wals_.GetWals()) { + wal_additions.AddWal(wal.first, wal.second); } } uint64_t new_manifest_file_size = 0; Status s; IOStatus io_s; + IOStatus manifest_io_status; { FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_); mu->Unlock(); - - TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest"); + TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart"); + TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr); if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) { for (int i = 0; i < static_cast(versions.size()); ++i) { assert(!builder_guards.empty() && @@ -3986,15 +4130,17 @@ Status VersionSet::ProcessManifestWrites( if (io_s.ok()) { descriptor_file->SetPreallocationBlockSize( db_options_->manifest_preallocation_size); - + FileTypeSet tmp_set = db_options_->checksum_handoff_file_types; std::unique_ptr file_writer(new WritableFileWriter( - std::move(descriptor_file), descriptor_fname, opt_file_opts, env_, - io_tracer_, nullptr, db_options_->listeners)); + std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_, + io_tracer_, nullptr, db_options_->listeners, nullptr, + tmp_set.Contains(FileType::kDescriptorFile))); descriptor_log_.reset( new log::Writer(std::move(file_writer), 0, false)); - s = WriteCurrentStateToManifest(curr_state, descriptor_log_.get(), - io_s); + s = WriteCurrentStateToManifest(curr_state, wal_additions, + descriptor_log_.get(), io_s); } else { + manifest_io_status = io_s; s = io_s; } } @@ -4017,8 +4163,8 @@ Status VersionSet::ProcessManifestWrites( e->DebugString(true)); break; } - TEST_KILL_RANDOM("VersionSet::LogAndApply:BeforeAddRecord", - rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord", + REDUCE_ODDS2); #ifndef NDEBUG if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) { TEST_SYNC_POINT_CALLBACK( @@ -4032,12 +4178,14 @@ Status VersionSet::ProcessManifestWrites( io_s = descriptor_log_->AddRecord(record); if (!io_s.ok()) { s = io_s; + manifest_io_status = io_s; break; } } if (s.ok()) { if (!db_options_->disable_manifest_sync) { - io_s = SyncManifest(env_, db_options_, descriptor_log_->file()); + io_s = SyncManifest(db_options_, descriptor_log_->file()); + manifest_io_status = io_s; } TEST_SYNC_POINT_CALLBACK( "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s); @@ -4051,6 +4199,9 @@ Status VersionSet::ProcessManifestWrites( // If we just created a new descriptor file, install it by writing a // new CURRENT file that points to it. + if (s.ok()) { + assert(manifest_io_status.ok()); + } if (s.ok() && new_descriptor_log) { io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_, db_directory); @@ -4076,6 +4227,20 @@ Status VersionSet::ProcessManifestWrites( mu->Lock(); } + if (s.ok()) { + // Apply WAL edits, DB mutex must be held. 
+ for (auto& e : batch_edits) { + if (e->IsWalAddition()) { + s = wals_.AddWals(e->GetWalAdditions()); + } else if (e->IsWalDeletion()) { + s = wals_.DeleteWalsBefore(e->GetWalDeletion().GetLogNumber()); + } + if (!s.ok()) { + break; + } + } + } + if (!io_s.ok()) { if (io_status_.ok()) { io_status_ = io_s; @@ -4105,23 +4270,23 @@ Status VersionSet::ProcessManifestWrites( // Each version in versions corresponds to a column family. // For each column family, update its log number indicating that logs // with number smaller than this should be ignored. - for (const auto version : versions) { - uint64_t max_log_number_in_batch = 0; - uint32_t cf_id = version->cfd_->GetID(); - for (const auto& e : batch_edits) { - if (e->has_log_number_ && e->column_family_ == cf_id) { - max_log_number_in_batch = - std::max(max_log_number_in_batch, e->log_number_); - } + uint64_t last_min_log_number_to_keep = 0; + for (const auto& e : batch_edits) { + ColumnFamilyData* cfd = nullptr; + if (!e->IsColumnFamilyManipulation()) { + cfd = column_family_set_->GetColumnFamily(e->column_family_); + // e would not have been added to batch_edits if its corresponding + // column family is dropped. + assert(cfd); } - if (max_log_number_in_batch != 0) { - assert(version->cfd_->GetLogNumber() <= max_log_number_in_batch); - version->cfd_->SetLogNumber(max_log_number_in_batch); + if (cfd) { + if (e->has_log_number_ && e->log_number_ > cfd->GetLogNumber()) { + cfd->SetLogNumber(e->log_number_); + } + if (e->HasFullHistoryTsLow()) { + cfd->SetFullHistoryTsLow(e->GetFullHistoryTsLow()); + } } - } - - uint64_t last_min_log_number_to_keep = 0; - for (auto& e : batch_edits) { if (e->has_min_log_number_to_keep_) { last_min_log_number_to_keep = std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_); @@ -4152,11 +4317,41 @@ Status VersionSet::ProcessManifestWrites( for (auto v : versions) { delete v; } + if (manifest_io_status.ok()) { + manifest_file_number_ = pending_manifest_file_number_; + manifest_file_size_ = new_manifest_file_size; + } // If manifest append failed for whatever reason, the file could be // corrupted. So we need to force the next version update to start a // new manifest file. descriptor_log_.reset(); - if (new_descriptor_log) { + // If manifest operations failed, then we know the CURRENT file still + // points to the original MANIFEST. Therefore, we can safely delete the + // new MANIFEST. + // If manifest operations succeeded, and we are here, then it is possible + // that renaming tmp file to CURRENT failed. + // + // On local POSIX-compliant FS, the CURRENT must point to the original + // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also + // keep it. Future recovery will ignore this MANIFEST. It's also ok for the + // process not to crash and continue using the db. Any future LogAndApply() + // call will switch to a new MANIFEST and update CURRENT, still ignoring + // this one. + // + // On non-local FS, it is + // possible that the rename operation succeeded on the server (remote) + // side, but the client somehow returns a non-ok status to RocksDB. Note + // that this does not violate atomicity. Should we delete the new MANIFEST + // successfully, a subsequent recovery attempt will likely see the CURRENT + // pointing to the new MANIFEST, thus fail. We will not be able to open the + // DB again. Therefore, if manifest operations succeed, we should keep the + // the new MANIFEST. 
If the process proceeds, any future LogAndApply() call + // will switch to a new MANIFEST and update CURRENT. If user tries to + // re-open the DB, + // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present. + // b) CURRENT points to the original MANIFEST, and the original MANIFEST + // also exists. + if (new_descriptor_log && !manifest_io_status.ok()) { ROCKS_LOG_INFO(db_options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", @@ -4187,6 +4382,9 @@ Status VersionSet::ProcessManifestWrites( } ready->status = s; ready->done = true; + if (ready->manifest_write_callback) { + (ready->manifest_write_callback)(s); + } if (need_signal) { ready->cv.Signal(); } @@ -4200,14 +4398,15 @@ Status VersionSet::ProcessManifestWrites( return s; } -// 'datas' is gramatically incorrect. We still use this notation to indicate +// 'datas' is grammatically incorrect. We still use this notation to indicate // that this variable represents a collection of column_family_data. Status VersionSet::LogAndApply( const autovector& column_family_datas, const autovector& mutable_cf_options_list, const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* db_directory, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options) { + const ColumnFamilyOptions* new_cf_options, + const std::vector>& manifest_wcbs) { mu->AssertHeld(); int num_edits = 0; for (const auto& elist : edit_lists) { @@ -4237,12 +4436,16 @@ Status VersionSet::LogAndApply( assert(static_cast(num_cfds) == edit_lists.size()); } for (int i = 0; i < num_cfds; ++i) { + const auto wcb = + manifest_wcbs.empty() ? [](const Status&) {} : manifest_wcbs[i]; writers.emplace_back(mu, column_family_datas[i], - *mutable_cf_options_list[i], edit_lists[i]); + *mutable_cf_options_list[i], edit_lists[i], wcb); manifest_writers_.push_back(&writers[i]); } assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); + TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:BeforeWriterWaiting", + nullptr); while (!first_writer.done && &first_writer != manifest_writers_.front()) { first_writer.cv.Wait(); } @@ -4254,6 +4457,7 @@ Status VersionSet::LogAndApply( for (const auto& writer : writers) { assert(writer.done); } + TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndDone", mu); #endif /* !NDEBUG */ return first_writer.status; } @@ -4323,153 +4527,11 @@ Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_ : last_sequence_); - Status s = builder->Apply(edit); - - return s; -} - -Status VersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, - const std::unordered_map& name_to_options, - std::unordered_map& column_families_not_found, - std::unordered_map>& - builders, - VersionEditParams* version_edit_params) { - // Not found means that user didn't supply that column - // family option AND we encountered column family add - // record. Once we encounter column family drop record, - // we will delete the column family from - // column_families_not_found. 
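The extended LogAndApply() overload above accepts one manifest-write callback per column family, forwarded into each ManifestWriter. A hedged usage sketch; the local variables (cfds, mutable_cf_options_list, edit_lists, mu, db_directory) are assumed to exist in the caller:

// Hypothetical caller of the new LogAndApply() overload: the callback runs
// with the MANIFEST write status for this writer, before LogAndApply()
// returns to the caller.
std::vector<std::function<void(const Status&)>> manifest_wcbs;
manifest_wcbs.emplace_back([](const Status& s) {
  if (!s.ok()) {
    // e.g. surface the MANIFEST write failure to the component that
    // queued this edit.
  }
});
Status s = versions->LogAndApply(cfds, mutable_cf_options_list, edit_lists,
                                 &mu, db_directory,
                                 /*new_descriptor_log=*/false,
                                 /*new_cf_options=*/nullptr, manifest_wcbs);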
- bool cf_in_not_found = (column_families_not_found.find(edit.column_family_) != - column_families_not_found.end()); - // in builders means that user supplied that column family - // option AND that we encountered column family add record - bool cf_in_builders = builders.find(edit.column_family_) != builders.end(); - - // they can't both be true - assert(!(cf_in_not_found && cf_in_builders)); - - ColumnFamilyData* cfd = nullptr; - - if (edit.is_column_family_add_) { - if (cf_in_builders || cf_in_not_found) { - return Status::Corruption( - "Manifest adding the same column family twice: " + - edit.column_family_name_); - } - auto cf_options = name_to_options.find(edit.column_family_name_); - // implicitly add persistent_stats column family without requiring user - // to specify - bool is_persistent_stats_column_family = - edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0; - if (cf_options == name_to_options.end() && - !is_persistent_stats_column_family) { - column_families_not_found.insert( - {edit.column_family_, edit.column_family_name_}); - } else { - // recover persistent_stats CF from a DB that already contains it - if (is_persistent_stats_column_family) { - ColumnFamilyOptions cfo; - OptimizeForPersistentStats(&cfo); - cfd = CreateColumnFamily(cfo, &edit); - } else { - cfd = CreateColumnFamily(cf_options->second, &edit); - } - cfd->set_initialized(); - builders.insert(std::make_pair( - edit.column_family_, std::unique_ptr( - new BaseReferencedVersionBuilder(cfd)))); - } - } else if (edit.is_column_family_drop_) { - if (cf_in_builders) { - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - builders.erase(builder); - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - assert(cfd != nullptr); - if (cfd->UnrefAndTryDelete()) { - cfd = nullptr; - } else { - // who else can have reference to cfd!? 
- assert(false); - } - } else if (cf_in_not_found) { - column_families_not_found.erase(edit.column_family_); - } else { - return Status::Corruption( - "Manifest - dropping non-existing column family"); - } - } else if (!cf_in_not_found) { - if (!cf_in_builders) { - return Status::Corruption( - "Manifest record referencing unknown column family"); - } - - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // this should never happen since cf_in_builders is true - assert(cfd != nullptr); - - // if it is not column family add or column family drop, - // then it's a file add/delete, which should be forwarded - // to builder - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - Status s = builder->second->version_builder()->Apply(&edit); - if (!s.ok()) { - return s; - } - } - return ExtractInfoFromVersionEdit(cfd, edit, version_edit_params); -} - -Status VersionSet::ExtractInfoFromVersionEdit( - ColumnFamilyData* cfd, const VersionEdit& from_edit, - VersionEditParams* version_edit_params) { - if (cfd != nullptr) { - if (from_edit.has_db_id_) { - version_edit_params->SetDBId(from_edit.db_id_); - } - if (from_edit.has_log_number_) { - if (cfd->GetLogNumber() > from_edit.log_number_) { - ROCKS_LOG_WARN( - db_options_->info_log, - "MANIFEST corruption detected, but ignored - Log numbers in " - "records NOT monotonically increasing"); - } else { - cfd->SetLogNumber(from_edit.log_number_); - version_edit_params->SetLogNumber(from_edit.log_number_); - } - } - if (from_edit.has_comparator_ && - from_edit.comparator_ != cfd->user_comparator()->Name()) { - return Status::InvalidArgument( - cfd->user_comparator()->Name(), - "does not match existing comparator " + from_edit.comparator_); - } - } - - if (from_edit.has_prev_log_number_) { - version_edit_params->SetPrevLogNumber(from_edit.prev_log_number_); - } - - if (from_edit.has_next_file_number_) { - version_edit_params->SetNextFile(from_edit.next_file_number_); - } - - if (from_edit.has_max_column_family_) { - version_edit_params->SetMaxColumnFamily(from_edit.max_column_family_); - } - - if (from_edit.has_min_log_number_to_keep_) { - version_edit_params->min_log_number_to_keep_ = - std::max(version_edit_params->min_log_number_to_keep_, - from_edit.min_log_number_to_keep_); - } - - if (from_edit.has_last_sequence_) { - version_edit_params->SetLastSequence(from_edit.last_sequence_); - } - return Status::OK(); + // The builder can be nullptr only if edit is WAL manipulation, + // because WAL edits do not need to be applied to versions, + // we return Status::OK() in this case. + assert(builder || edit->IsWalManipulation()); + return builder ? 
builder->Apply(edit) : Status::OK(); } Status VersionSet::GetCurrentManifestPath(const std::string& dbname, @@ -4503,89 +4565,9 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname, return Status::OK(); } -Status VersionSet::ReadAndRecover( - log::Reader& reader, AtomicGroupReadBuffer* read_buffer, - const std::unordered_map& name_to_options, - std::unordered_map& column_families_not_found, - std::unordered_map>& - builders, - Status* log_read_status, VersionEditParams* version_edit_params, - std::string* db_id) { - assert(read_buffer != nullptr); - assert(log_read_status != nullptr); - Status s; - Slice record; - std::string scratch; - size_t recovered_edits = 0; - while (s.ok() && reader.ReadRecord(&record, &scratch) && - log_read_status->ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - if (edit.has_db_id_) { - db_id_ = edit.GetDbId(); - if (db_id != nullptr) { - db_id->assign(edit.GetDbId()); - } - } - s = read_buffer->AddEdit(&edit); - if (!s.ok()) { - break; - } - if (edit.is_in_atomic_group_) { - if (read_buffer->IsFull()) { - // Apply edits in an atomic group when we have read all edits in the - // group. - for (auto& e : read_buffer->replay_buffer()) { - s = ApplyOneVersionEditToBuilder(e, name_to_options, - column_families_not_found, builders, - version_edit_params); - if (!s.ok()) { - break; - } - recovered_edits++; - } - if (!s.ok()) { - break; - } - read_buffer->Clear(); - } - } else { - // Apply a normal edit immediately. - s = ApplyOneVersionEditToBuilder(edit, name_to_options, - column_families_not_found, builders, - version_edit_params); - if (s.ok()) { - recovered_edits++; - } - } - } - if (!log_read_status->ok()) { - s = *log_read_status; - } - if (!s.ok()) { - // Clear the buffer if we fail to decode/apply an edit. - read_buffer->Clear(); - } - TEST_SYNC_POINT_CALLBACK("VersionSet::ReadAndRecover:RecoveredEdits", - &recovered_edits); - return s; -} - Status VersionSet::Recover( const std::vector& column_families, bool read_only, std::string* db_id) { - std::unordered_map cf_name_to_options; - for (const auto& cf : column_families) { - cf_name_to_options.emplace(cf.name, cf.options); - } - // keeps track of column families in manifest that were not found in - // column families parameters. if those column families are not dropped - // by subsequent manifest records, Recover() will return failure status - std::unordered_map column_families_not_found; - // Read "CURRENT" file, which contains a pointer to the current manifest file std::string manifest_path; Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path, @@ -4610,139 +4592,30 @@ Status VersionSet::Recover( new SequentialFileReader(std::move(manifest_file), manifest_path, db_options_->log_readahead_size, io_tracer_)); } - - VersionBuilderMap builders; - - // add default column family - auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); - if (default_cf_iter == cf_name_to_options.end()) { - return Status::InvalidArgument("Default column family not specified"); - } - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - ColumnFamilyData* default_cfd = - CreateColumnFamily(default_cf_iter->second, &default_cf_edit); - // In recovery, nobody else can access it, so it's fine to set it to be - // initialized earlier. 
- default_cfd->set_initialized(); - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); uint64_t current_manifest_file_size = 0; - VersionEditParams version_edit_params; + uint64_t log_number = 0; { VersionSet::LogReporter reporter; Status log_read_status; reporter.status = &log_read_status; log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, true /* checksum */, 0 /* log_number */); - AtomicGroupReadBuffer read_buffer; - s = ReadAndRecover(reader, &read_buffer, cf_name_to_options, - column_families_not_found, builders, &log_read_status, - &version_edit_params, db_id); - current_manifest_file_size = reader.GetReadOffset(); - assert(current_manifest_file_size != 0); - } - - if (s.ok()) { - if (!version_edit_params.has_next_file_number_) { - s = Status::Corruption("no meta-nextfile entry in descriptor"); - } else if (!version_edit_params.has_log_number_) { - s = Status::Corruption("no meta-lognumber entry in descriptor"); - } else if (!version_edit_params.has_last_sequence_) { - s = Status::Corruption("no last-sequence-number entry in descriptor"); - } - - if (!version_edit_params.has_prev_log_number_) { - version_edit_params.SetPrevLogNumber(0); - } - - column_family_set_->UpdateMaxColumnFamily( - version_edit_params.max_column_family_); - - // When reading DB generated using old release, min_log_number_to_keep=0. - // All log files will be scanned for potential prepare entries. - MarkMinLogNumberToKeep2PC(version_edit_params.min_log_number_to_keep_); - MarkFileNumberUsed(version_edit_params.prev_log_number_); - MarkFileNumberUsed(version_edit_params.log_number_); - } - - // there were some column families in the MANIFEST that weren't specified - // in the argument. This is OK in read_only mode - if (read_only == false && !column_families_not_found.empty()) { - std::string list_of_not_found; - for (const auto& cf : column_families_not_found) { - list_of_not_found += ", " + cf.second; - } - list_of_not_found = list_of_not_found.substr(2); - s = Status::InvalidArgument( - "You have to open all column families. Column families not opened: " + - list_of_not_found); - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - assert(builders.count(cfd->GetID()) > 0); - auto* builder = builders[cfd->GetID()]->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } + VersionEditHandler handler(read_only, column_families, + const_cast(this), + /*track_missing_files=*/false, + /*no_error_if_files_missing=*/false, io_tracer_); + handler.Iterate(reader, &log_read_status); + s = handler.status(); + if (s.ok()) { + log_number = handler.GetVersionEditParams().log_number_; + current_manifest_file_size = reader.GetReadOffset(); + assert(current_manifest_file_size != 0); + handler.GetDbId(db_id); } } if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - if (read_only) { - cfd->table_cache()->SetTablesAreImmortal(); - } - assert(cfd->initialized()); - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto builder = builders_iter->second->version_builder(); - - // unlimited table cache. Pre-load table handle now. - // Need to do it out of the mutex. 
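Recover(), TryRecoverFromOneManifest(), ListColumnFamilies() and DumpManifest() now share one replay pattern: wrap the MANIFEST in a log::Reader and let a handler consume it. A generic sketch of that pattern; the template helper is illustrative only, while the handler types (VersionEditHandler, VersionEditHandlerPointInTime, ListColumnFamiliesHandler, DumpManifestHandler) come from the patch:

// Illustrative only: the common shape of handler-based MANIFEST replay.
template <typename HandlerT>
Status ReplayManifest(log::Reader& reader, HandlerT& handler) {
  Status log_read_status;                  // filled via the log reporter
  handler.Iterate(reader, &log_read_status);
  return handler.status();                 // overall outcome of the replay
}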
- s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - true /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); - if (!s.ok()) { - if (db_options_->paranoid_checks) { - return s; - } - s = Status::OK(); - } - - Version* v = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), io_tracer_, - current_version_number_++); - s = builder->SaveTo(v->storage_info()); - if (!s.ok()) { - delete v; - return s; - } - - // Install recovered version - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), - !(db_options_->skip_stats_update_on_db_open)); - AppendVersion(cfd, v); - } - manifest_file_size_ = current_manifest_file_size; - next_file_number_.store(version_edit_params.next_file_number_ + 1); - last_allocated_sequence_ = version_edit_params.last_sequence_; - last_published_sequence_ = version_edit_params.last_sequence_; - last_sequence_ = version_edit_params.last_sequence_; - prev_log_number_ = version_edit_params.prev_log_number_; - ROCKS_LOG_INFO( db_options_->info_log, "Recovered from manifest file:%s succeeded," @@ -4751,9 +4624,8 @@ Status VersionSet::Recover( ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 ",min_log_number_to_keep is %" PRIu64 "\n", manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), - last_sequence_.load(), version_edit_params.log_number_, - prev_log_number_, column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); + last_sequence_.load(), log_number, prev_log_number_, + column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { @@ -4901,7 +4773,9 @@ Status VersionSet::TryRecoverFromOneManifest( VersionEditHandlerPointInTime handler_pit( read_only, column_families, const_cast(this), io_tracer_); - handler_pit.Iterate(reader, &s, db_id); + handler_pit.Iterate(reader, &s); + + handler_pit.GetDbId(db_id); assert(nullptr != has_missing_table_file); *has_missing_table_file = handler_pit.HasMissingFiles(); @@ -4912,7 +4786,7 @@ Status VersionSet::TryRecoverFromOneManifest( Status VersionSet::ListColumnFamilies(std::vector* column_families, const std::string& dbname, FileSystem* fs) { - // these are just for performance reasons, not correcntes, + // these are just for performance reasons, not correctness, // so we're fine using the defaults FileOptions soptions; // Read "CURRENT" file, which contains a pointer to the current manifest file @@ -4935,48 +4809,23 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, nullptr /*IOTracer*/)); } - std::map column_family_names; - // default column family is always implicitly there - column_family_names.insert({0, kDefaultColumnFamilyName}); VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - if (edit.is_column_family_add_) { - if (column_family_names.find(edit.column_family_) != - column_family_names.end()) { - s = Status::Corruption("Manifest adding the same column family twice"); - break; - } - column_family_names.insert( - {edit.column_family_, edit.column_family_name_}); - } else if 
(edit.is_column_family_drop_) { - if (column_family_names.find(edit.column_family_) == - column_family_names.end()) { - s = Status::Corruption( - "Manifest - dropping non-existing column family"); - break; - } - column_family_names.erase(edit.column_family_); - } - } + ListColumnFamiliesHandler handler; + handler.Iterate(reader, &s); + + assert(column_families); column_families->clear(); - if (s.ok()) { - for (const auto& iter : column_family_names) { + if (handler.status().ok()) { + for (const auto& iter : handler.GetColumnFamilyNames()) { column_families->push_back(iter.second); } } - return s; + return handler.status(); } #ifndef ROCKSDB_LITE @@ -4996,7 +4845,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, WriteController wc(options->delayed_write_rate); WriteBufferManager wb(options->db_write_buffer_size); VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc, - nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/); + nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/, + /*db_session_id*/ ""); Status status; std::vector dummy; @@ -5078,7 +4928,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, } // Get the checksum information including the checksum and checksum function -// name of all SST files in VersionSet. Store the information in +// name of all SST and blob files in VersionSet. Store the information in // FileChecksumList which contains a map from file number to its checksum info. // If DB is not running, make sure call VersionSet::Recover() to load the file // metadata from Manifest to VersionSet before calling this function. @@ -5095,6 +4945,7 @@ Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { if (cfd->IsDropped() || !cfd->initialized()) { continue; } + /* SST files */ for (int level = 0; level < cfd->NumberLevels(); level++) { for (const auto& file : cfd->current()->storage_info()->LevelFiles(level)) { @@ -5102,17 +4953,36 @@ Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { file->file_checksum, file->file_checksum_func_name); if (!s.ok()) { - break; + return s; } } + } + + /* Blob files */ + const auto& blob_files = cfd->current()->storage_info()->GetBlobFiles(); + for (const auto& pair : blob_files) { + const uint64_t blob_file_number = pair.first; + const auto& meta = pair.second; + + assert(meta); + assert(blob_file_number == meta->GetBlobFileNumber()); + + std::string checksum_value = meta->GetChecksumValue(); + std::string checksum_method = meta->GetChecksumMethod(); + assert(checksum_value.empty() == checksum_method.empty()); + if (meta->GetChecksumMethod().empty()) { + checksum_value = kUnknownFileChecksum; + checksum_method = kUnknownFileChecksumFuncName; + } + + s = checksum_list->InsertOneFileChecksum(blob_file_number, checksum_value, + checksum_method); if (!s.ok()) { - break; + return s; } } - if (!s.ok()) { - break; - } } + return s; } @@ -5135,194 +5005,19 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, std::move(file), dscname, db_options_->log_readahead_size, io_tracer_)); } - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t previous_log_number = 0; - int count = 0; - std::unordered_map comparators; - std::unordered_map> - builders; - - // add default column family - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - 
ColumnFamilyData* default_cfd = - CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit); - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); - + std::vector column_families( + 1, ColumnFamilyDescriptor(kDefaultColumnFamilyName, options)); + DumpManifestHandler handler(column_families, this, io_tracer_, verbose, hex, + json); { VersionSet::LogReporter reporter; reporter.status = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - // Write out each individual edit - if (verbose && !json) { - printf("%s\n", edit.DebugString(hex).c_str()); - } else if (json) { - printf("%s\n", edit.DebugJSON(count, hex).c_str()); - } - count++; - - bool cf_in_builders = - builders.find(edit.column_family_) != builders.end(); - - if (edit.has_comparator_) { - comparators.insert({edit.column_family_, edit.comparator_}); - } - - ColumnFamilyData* cfd = nullptr; - - if (edit.is_column_family_add_) { - if (cf_in_builders) { - s = Status::Corruption( - "Manifest adding the same column family twice"); - break; - } - cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit); - cfd->set_initialized(); - builders.insert(std::make_pair( - edit.column_family_, std::unique_ptr( - new BaseReferencedVersionBuilder(cfd)))); - } else if (edit.is_column_family_drop_) { - if (!cf_in_builders) { - s = Status::Corruption( - "Manifest - dropping non-existing column family"); - break; - } - auto builder_iter = builders.find(edit.column_family_); - builders.erase(builder_iter); - comparators.erase(edit.column_family_); - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - assert(cfd != nullptr); - cfd->UnrefAndTryDelete(); - cfd = nullptr; - } else { - if (!cf_in_builders) { - s = Status::Corruption( - "Manifest record referencing unknown column family"); - break; - } - - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // this should never happen since cf_in_builders is true - assert(cfd != nullptr); - - // if it is not column family add or column family drop, - // then it's a file add/delete, which should be forwarded - // to builder - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - s = builder->second->version_builder()->Apply(&edit); - if (!s.ok()) { - break; - } - } - - if (cfd != nullptr && edit.has_log_number_) { - cfd->SetLogNumber(edit.log_number_); - } - - - if (edit.has_prev_log_number_) { - previous_log_number = edit.prev_log_number_; - have_prev_log_number = true; - } - - if (edit.has_next_file_number_) { - next_file = edit.next_file_number_; - have_next_file = true; - } - - if (edit.has_last_sequence_) { - last_sequence = edit.last_sequence_; - have_last_sequence = true; - } - - if (edit.has_max_column_family_) { - column_family_set_->UpdateMaxColumnFamily(edit.max_column_family_); - } - - if (edit.has_min_log_number_to_keep_) { - MarkMinLogNumberToKeep2PC(edit.min_log_number_to_keep_); - } - } + handler.Iterate(reader, &s); } - file_reader.reset(); - - if (s.ok()) { - if (!have_next_file) { - s = Status::Corruption("no meta-nextfile entry in descriptor"); - printf("no meta-nextfile entry in descriptor"); - } else if (!have_last_sequence) { - printf("no last-sequence-number entry in descriptor"); - s = Status::Corruption("no 
last-sequence-number entry in descriptor"); - } - if (!have_prev_log_number) { - previous_log_number = 0; - } - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto builder = builders_iter->second->version_builder(); - - Version* v = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), io_tracer_, - current_version_number_++); - s = builder->SaveTo(v->storage_info()); - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false); - - printf("--------------- Column family \"%s\" (ID %" PRIu32 - ") --------------\n", - cfd->GetName().c_str(), cfd->GetID()); - printf("log number: %" PRIu64 "\n", cfd->GetLogNumber()); - auto comparator = comparators.find(cfd->GetID()); - if (comparator != comparators.end()) { - printf("comparator: %s\n", comparator->second.c_str()); - } else { - printf("comparator: \n"); - } - printf("%s \n", v->DebugString(hex).c_str()); - delete v; - } - - next_file_number_.store(next_file + 1); - last_allocated_sequence_ = last_sequence; - last_published_sequence_ = last_sequence; - last_sequence_ = last_sequence; - prev_log_number_ = previous_log_number; - - printf("next_file_number %" PRIu64 " last_sequence %" PRIu64 - " prev_log_number %" PRIu64 " max_column_family %" PRIu32 - " min_log_number_to_keep " - "%" PRIu64 "\n", - next_file_number_.load(), last_sequence, previous_log_number, - column_family_set_->GetMaxColumnFamily(), - min_log_number_to_keep_2pc()); - } - - return s; + return handler.status(); } #endif // ROCKSDB_LITE @@ -5343,7 +5038,7 @@ void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) { Status VersionSet::WriteCurrentStateToManifest( const std::unordered_map& curr_state, - log::Writer* log, IOStatus& io_s) { + const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) { // TODO: Break up into multiple records to reduce memory usage on recovery? // WARNING: This method doesn't hold a mutex!! @@ -5368,6 +5063,21 @@ Status VersionSet::WriteCurrentStateToManifest( } } + // Save WALs. + if (!wal_additions.GetWalAdditions().empty()) { + TEST_SYNC_POINT_CALLBACK("VersionSet::WriteCurrentStateToManifest:SaveWal", + const_cast(&wal_additions)); + std::string record; + if (!wal_additions.EncodeTo(&record)) { + return Status::Corruption("Unable to Encode VersionEdit: " + + wal_additions.DebugString(true)); + } + io_s = log->AddRecord(record); + if (!io_s.ok()) { + return io_s; + } + } + for (auto cfd : *column_family_set_) { assert(cfd); @@ -5438,6 +5148,21 @@ Status VersionSet::WriteCurrentStateToManifest( assert(iter != curr_state.end()); uint64_t log_number = iter->second.log_number; edit.SetLogNumber(log_number); + + if (cfd->GetID() == 0) { + // min_log_number_to_keep is for the whole db, not for specific column family. + // So it does not need to be set for every column family, just need to be set once. + // Since default CF can never be dropped, we set the min_log to the default CF here. 
+ uint64_t min_log = min_log_number_to_keep_2pc(); + if (min_log != 0) { + edit.SetMinLogNumberToKeep(min_log); + } + } + + const std::string& full_history_ts_low = iter->second.full_history_ts_low; + if (!full_history_ts_low.empty()) { + edit.SetFullHistoryTsLow(full_history_ts_low); + } std::string record; if (!edit.EncodeTo(&record)) { return Status::Corruption( @@ -5785,20 +5510,6 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { "[%s] compaction output being applied to a different base version from" " input version", c->column_family_data()->GetName().c_str()); - - if (vstorage->compaction_style_ == kCompactionStyleLevel && - c->start_level() == 0 && c->num_input_levels() > 2U) { - // We are doing a L0->base_level compaction. The assumption is if - // base level is not L1, levels from L1 to base_level - 1 is empty. - // This is ensured by having one compaction from L0 going on at the - // same time in level-based compaction. So that during the time, no - // compaction/flush can put files to those levels. - for (int l = c->start_level() + 1; l < c->output_level(); l++) { - if (vstorage->NumLevelFiles(l) != 0) { - return false; - } - } - } } for (size_t input = 0; input < c->num_input_levels(); ++input) { @@ -5881,6 +5592,9 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { filemetadata.oldest_blob_file_number = file->oldest_blob_file_number; filemetadata.file_checksum = file->file_checksum; filemetadata.file_checksum_func_name = file->file_checksum_func_name; + filemetadata.temperature = file->temperature; + filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime(); + filemetadata.file_creation_time = file->TryGetFileCreationTime(); metadata->push_back(filemetadata); } } @@ -5996,8 +5710,8 @@ ReactiveVersionSet::ReactiveVersionSet( const std::shared_ptr& io_tracer) : VersionSet(dbname, _db_options, _file_options, table_cache, write_buffer_manager, write_controller, - /*block_cache_tracer=*/nullptr, io_tracer), - number_of_edits_to_skip_(0) {} + /*block_cache_tracer=*/nullptr, io_tracer, + /*db_session_id*/ "") {} ReactiveVersionSet::~ReactiveVersionSet() {} @@ -6010,394 +5724,44 @@ Status ReactiveVersionSet::Recover( assert(manifest_reporter != nullptr); assert(manifest_reader_status != nullptr); - std::unordered_map cf_name_to_options; - for (const auto& cf : column_families) { - cf_name_to_options.insert({cf.name, cf.options}); - } - - // add default column family - auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); - if (default_cf_iter == cf_name_to_options.end()) { - return Status::InvalidArgument("Default column family not specified"); - } - VersionEdit default_cf_edit; - default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); - default_cf_edit.SetColumnFamily(0); - ColumnFamilyData* default_cfd = - CreateColumnFamily(default_cf_iter->second, &default_cf_edit); - // In recovery, nobody else can access it, so it's fine to set it to be - // initialized earlier. 
- default_cfd->set_initialized(); - VersionBuilderMap builders; - std::unordered_map column_families_not_found; - builders.insert( - std::make_pair(0, std::unique_ptr( - new BaseReferencedVersionBuilder(default_cfd)))); - manifest_reader_status->reset(new Status()); manifest_reporter->reset(new LogReporter()); static_cast_with_check(manifest_reporter->get())->status = manifest_reader_status->get(); Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader); log::Reader* reader = manifest_reader->get(); + assert(reader); - int retry = 0; - VersionEdit version_edit; - while (s.ok() && retry < 1) { - assert(reader != nullptr); - s = ReadAndRecover(*reader, &read_buffer_, cf_name_to_options, - column_families_not_found, builders, - manifest_reader_status->get(), &version_edit); - if (s.ok()) { - bool enough = version_edit.has_next_file_number_ && - version_edit.has_log_number_ && - version_edit.has_last_sequence_; - if (enough) { - for (const auto& cf : column_families) { - auto cfd = column_family_set_->GetColumnFamily(cf.name); - if (cfd == nullptr) { - enough = false; - break; - } - } - } - if (enough) { - for (const auto& cf : column_families) { - auto cfd = column_family_set_->GetColumnFamily(cf.name); - assert(cfd != nullptr); - if (!cfd->IsDropped()) { - auto builder_iter = builders.find(cfd->GetID()); - assert(builder_iter != builders.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - true /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); - if (!s.ok()) { - enough = false; - if (s.IsPathNotFound()) { - s = Status::OK(); - } - break; - } - } - } - } - if (enough) { - break; - } - } - ++retry; - } - - if (s.ok()) { - if (!version_edit.has_prev_log_number_) { - version_edit.prev_log_number_ = 0; - } - column_family_set_->UpdateMaxColumnFamily(version_edit.max_column_family_); - - MarkMinLogNumberToKeep2PC(version_edit.min_log_number_to_keep_); - MarkFileNumberUsed(version_edit.prev_log_number_); - MarkFileNumberUsed(version_edit.log_number_); + manifest_tailer_.reset(new ManifestTailer( + column_families, const_cast(this), io_tracer_)); - for (auto cfd : *column_family_set_) { - assert(builders.count(cfd->GetID()) > 0); - auto builder = builders[cfd->GetID()]->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } - } - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - assert(cfd->initialized()); - auto builders_iter = builders.find(cfd->GetID()); - assert(builders_iter != builders.end()); - auto* builder = builders_iter->second->version_builder(); + manifest_tailer_->Iterate(*reader, manifest_reader_status->get()); - Version* v = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), io_tracer_, - current_version_number_++); - s = builder->SaveTo(v->storage_info()); - - if (s.ok()) { - // Install recovered version - v->PrepareApply(*cfd->GetLatestMutableCFOptions(), - !(db_options_->skip_stats_update_on_db_open)); - AppendVersion(cfd, v); - } else { - ROCKS_LOG_ERROR(db_options_->info_log, - "[%s]: inconsistent version: %s\n", - cfd->GetName().c_str(), s.ToString().c_str()); - delete v; - break; - } - } - } - 
if (s.ok()) { - next_file_number_.store(version_edit.next_file_number_ + 1); - last_allocated_sequence_ = version_edit.last_sequence_; - last_published_sequence_ = version_edit.last_sequence_; - last_sequence_ = version_edit.last_sequence_; - prev_log_number_ = version_edit.prev_log_number_; - for (auto cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - ROCKS_LOG_INFO(db_options_->info_log, - "Column family [%s] (ID %u), log number is %" PRIu64 "\n", - cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); - } - } - return s; + return manifest_tailer_->status(); } Status ReactiveVersionSet::ReadAndApply( InstrumentedMutex* mu, std::unique_ptr* manifest_reader, + Status* manifest_read_status, std::unordered_set* cfds_changed) { assert(manifest_reader != nullptr); assert(cfds_changed != nullptr); mu->AssertHeld(); Status s; - uint64_t applied_edits = 0; - while (s.ok()) { - Slice record; - std::string scratch; - log::Reader* reader = manifest_reader->get(); - std::string old_manifest_path = reader->file()->file_name(); - while (reader->ReadRecord(&record, &scratch)) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (!s.ok()) { - break; - } - - // Skip the first VersionEdits of each MANIFEST generated by - // VersionSet::WriteCurrentStatetoManifest. - if (number_of_edits_to_skip_ > 0) { - ColumnFamilyData* cfd = - column_family_set_->GetColumnFamily(edit.column_family_); - if (cfd != nullptr && !cfd->IsDropped()) { - --number_of_edits_to_skip_; - } - continue; - } - - s = read_buffer_.AddEdit(&edit); - if (!s.ok()) { - break; - } - VersionEdit temp_edit; - if (edit.is_in_atomic_group_) { - if (read_buffer_.IsFull()) { - // Apply edits in an atomic group when we have read all edits in the - // group. - for (auto& e : read_buffer_.replay_buffer()) { - s = ApplyOneVersionEditToBuilder(e, cfds_changed, &temp_edit); - if (!s.ok()) { - break; - } - applied_edits++; - } - if (!s.ok()) { - break; - } - read_buffer_.Clear(); - } - } else { - // Apply a normal edit immediately. - s = ApplyOneVersionEditToBuilder(edit, cfds_changed, &temp_edit); - if (s.ok()) { - applied_edits++; - } else { - break; - } - } - } - if (!s.ok()) { - // Clear the buffer if we fail to decode/apply an edit. - read_buffer_.Clear(); - } - // It's possible that: - // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. - // Or the version(s) rebuilt from tailing the MANIFEST is inconsistent. - // 2) we have finished reading the current MANIFEST. - // 3) we have encountered an IOError reading the current MANIFEST. - // We need to look for the next MANIFEST and start from there. If we cannot - // find the next MANIFEST, we should exit the loop. - Status tmp_s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); - reader = manifest_reader->get(); - if (tmp_s.ok()) { - if (reader->file()->file_name() == old_manifest_path) { - // Still processing the same MANIFEST, thus no need to continue this - // loop since no record is available if we have reached here. - break; - } else { - // We have switched to a new MANIFEST whose first records have been - // generated by VersionSet::WriteCurrentStatetoManifest. Since the - // secondary instance has already finished recovering upon start, there - // is no need for the secondary to process these records. Actually, if - // the secondary were to replay these records, the secondary may end up - // adding the same SST files AGAIN to each column family, causing - // consistency checks done by VersionBuilder to fail. 
Therefore, we - // record the number of records to skip at the beginning of the new - // MANIFEST and ignore them. - number_of_edits_to_skip_ = 0; - for (auto* cfd : *column_family_set_) { - if (cfd->IsDropped()) { - continue; - } - // Increase number_of_edits_to_skip by 2 because - // WriteCurrentStatetoManifest() writes 2 version edits for each - // column family at the beginning of the newly-generated MANIFEST. - // TODO(yanqin) remove hard-coded value. - if (db_options_->write_dbid_to_manifest) { - number_of_edits_to_skip_ += 3; - } else { - number_of_edits_to_skip_ += 2; - } - } - s = tmp_s; - } - } - } - - if (s.ok()) { - for (auto cfd : *column_family_set_) { - auto builder_iter = active_version_builders_.find(cfd->GetID()); - if (builder_iter == active_version_builders_.end()) { - continue; - } - auto builder = builder_iter->second->version_builder(); - if (!builder->CheckConsistencyForNumLevels()) { - s = Status::InvalidArgument( - "db has more levels than options.num_levels"); - break; - } - } - } - TEST_SYNC_POINT_CALLBACK("ReactiveVersionSet::ReadAndApply:AppliedEdits", - &applied_edits); - return s; -} - -Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( - VersionEdit& edit, std::unordered_set* cfds_changed, - VersionEdit* version_edit) { - ColumnFamilyData* cfd = - column_family_set_->GetColumnFamily(edit.column_family_); - - // If we cannot find this column family in our column family set, then it - // may be a new column family created by the primary after the secondary - // starts. It is also possible that the secondary instance opens only a subset - // of column families. Ignore it for now. - if (nullptr == cfd) { - return Status::OK(); - } - if (active_version_builders_.find(edit.column_family_) == - active_version_builders_.end() && - !cfd->IsDropped()) { - std::unique_ptr builder_guard( - new BaseReferencedVersionBuilder(cfd)); - active_version_builders_.insert( - std::make_pair(edit.column_family_, std::move(builder_guard))); - } - - auto builder_iter = active_version_builders_.find(edit.column_family_); - assert(builder_iter != active_version_builders_.end()); - auto builder = builder_iter->second->version_builder(); - assert(builder != nullptr); - - if (edit.is_column_family_add_) { - // TODO (yanqin) for now the secondary ignores column families created - // after Open. This also simplifies handling of switching to a new MANIFEST - // and processing the snapshot of the system at the beginning of the - // MANIFEST. - } else if (edit.is_column_family_drop_) { - // Drop the column family by setting it to be 'dropped' without destroying - // the column family handle. - // TODO (haoyu) figure out how to handle column faimly drop for - // secondary instance. (Is it possible that the ref count for cfd is 0 but - // the ref count for its versions is higher than 0?) 
- cfd->SetDropped(); - if (cfd->UnrefAndTryDelete()) { - cfd = nullptr; - } - active_version_builders_.erase(builder_iter); - } else { - Status s = builder->Apply(&edit); - if (!s.ok()) { - return s; - } - } - Status s = ExtractInfoFromVersionEdit(cfd, edit, version_edit); + log::Reader* reader = manifest_reader->get(); + assert(reader); + s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); if (!s.ok()) { return s; } - - if (cfd != nullptr && !cfd->IsDropped()) { - s = builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, - cfd->GetLatestMutableCFOptions()->prefix_extractor.get(), - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); - TEST_SYNC_POINT_CALLBACK( - "ReactiveVersionSet::ApplyOneVersionEditToBuilder:" - "AfterLoadTableHandlers", - &s); - - if (s.ok()) { - auto version = new Version(cfd, this, file_options_, - *cfd->GetLatestMutableCFOptions(), io_tracer_, - current_version_number_++); - s = builder->SaveTo(version->storage_info()); - if (s.ok()) { - version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); - AppendVersion(cfd, version); - active_version_builders_.erase(builder_iter); - if (cfds_changed->count(cfd) == 0) { - cfds_changed->insert(cfd); - } - } else { - delete version; - } - } else if (s.IsPathNotFound()) { - s = Status::OK(); - } - // Some other error has occurred during LoadTableHandlers. - } - + manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status); + s = manifest_tailer_->status(); if (s.ok()) { - if (version_edit->HasNextFile()) { - next_file_number_.store(version_edit->next_file_number_ + 1); - } - if (version_edit->has_last_sequence_) { - last_allocated_sequence_ = version_edit->last_sequence_; - last_published_sequence_ = version_edit->last_sequence_; - last_sequence_ = version_edit->last_sequence_; - } - if (version_edit->has_prev_log_number_) { - prev_log_number_ = version_edit->prev_log_number_; - MarkFileNumberUsed(version_edit->prev_log_number_); - } - if (version_edit->has_log_number_) { - MarkFileNumberUsed(version_edit->log_number_); - } - column_family_set_->UpdateMaxColumnFamily(version_edit->max_column_family_); - MarkMinLogNumberToKeep2PC(version_edit->min_log_number_to_keep_); + *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies()); } + return s; } @@ -6421,7 +5785,7 @@ Status ReactiveVersionSet::MaybeSwitchManifest( "ReactiveVersionSet::MaybeSwitchManifest:" "AfterGetCurrentManifestPath:1"); s = fs_->NewSequentialFile(manifest_path, - env_->OptimizeForManifestRead(file_options_), + fs_->OptimizeForManifestRead(file_options_), &manifest_file, nullptr); } else { // No need to switch manifest. @@ -6438,15 +5802,24 @@ Status ReactiveVersionSet::MaybeSwitchManifest( true /* checksum */, 0 /* log_number */)); ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", manifest_path.c_str()); - // TODO (yanqin) every time we switch to a new MANIFEST, we clear the - // active_version_builders_ map because we choose to construct the - // versions from scratch, thanks to the first part of each MANIFEST - // written by VersionSet::WriteCurrentStatetoManifest. This is not - // necessary, but we choose this at present for the sake of simplicity. 
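With the hand-rolled edit replay removed, ReadAndApply() above delegates to the ManifestTailer and reports the changed column families back to the caller. A hypothetical catch-up loop in a secondary instance built on that signature; the local variables (mu, manifest_reader, reactive_versions) are assumed:

// Hypothetical secondary-instance catch-up built on the new ReadAndApply().
std::unordered_set<ColumnFamilyData*> cfds_changed;
Status manifest_read_status;
Status s = reactive_versions->ReadAndApply(&mu, &manifest_reader,
                                           &manifest_read_status,
                                           &cfds_changed);
if (s.ok()) {
  for (ColumnFamilyData* cfd : cfds_changed) {
    // e.g. install new SuperVersions for the column families whose
    // Version advanced while tailing the primary's MANIFEST.
  }
}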
- active_version_builders_.clear(); + if (manifest_tailer_) { + manifest_tailer_->PrepareToReadNewManifest(); + } } } while (s.IsPathNotFound()); return s; } +#ifndef NDEBUG +uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const { + assert(manifest_tailer_); + return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group(); +} +#endif // !NDEBUG + +std::vector& ReactiveVersionSet::replay_buffer() { + assert(manifest_tailer_); + return manifest_tailer_->GetReadBuffer().replay_buffer(); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_set.h b/db/version_set.h index 93b4509611d..4a593e97cdf 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -26,9 +26,11 @@ #include #include #include +#include #include #include +#include "cache/cache_helpers.h" #include "db/blob/blob_file_meta.h" #include "db/column_family.h" #include "db/compaction/compaction.h" @@ -58,6 +60,7 @@ namespace log { class Writer; } +class BlobIndex; class Compaction; class LogBuffer; class LookupKey; @@ -68,6 +71,8 @@ class WriteBufferManager; class MergeContext; class ColumnFamilySet; class MergeIteratorBuilder; +class SystemClock; +class ManifestTailer; // VersionEdit is always supposed to be valid and it is used to point at // entries in Manifest. Ideally it should not be used as a container to @@ -145,7 +150,7 @@ class VersionStorageInfo { // We use compaction scores to figure out which compaction to do next // REQUIRES: db_mutex held!! // TODO find a better way to pass compaction_options_fifo. - void ComputeCompactionScore(const ImmutableCFOptions& immutable_cf_options, + void ComputeCompactionScore(const ImmutableOptions& immutable_options, const MutableCFOptions& mutable_cf_options); // Estimate est_comp_needed_bytes_ @@ -158,13 +163,13 @@ class VersionStorageInfo { // This computes ttl_expired_files_ and is called by // ComputeCompactionScore() - void ComputeExpiredTtlFiles(const ImmutableCFOptions& ioptions, + void ComputeExpiredTtlFiles(const ImmutableOptions& ioptions, const uint64_t ttl); // This computes files_marked_for_periodic_compaction_ and is called by // ComputeCompactionScore() void ComputeFilesMarkedForPeriodicCompaction( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const uint64_t periodic_compaction_seconds); // This computes bottommost_files_marked_for_compaction_ and is called by @@ -340,6 +345,19 @@ class VersionStorageInfo { using BlobFiles = std::map>; const BlobFiles& GetBlobFiles() const { return blob_files_; } + uint64_t GetTotalBlobFileSize() const { + uint64_t total_blob_bytes = 0; + + for (const auto& pair : blob_files_) { + const auto& meta = pair.second; + assert(meta); + + total_blob_bytes += meta->GetTotalBlobBytes(); + } + + return total_blob_bytes; + } + const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const { assert(level < static_cast(level_files_brief_.size())); return level_files_brief_[level]; @@ -464,7 +482,7 @@ class VersionStorageInfo { uint64_t MaxBytesForLevel(int level) const; // Must be called after any change to MutableCFOptions. - void CalculateBaseBytes(const ImmutableCFOptions& ioptions, + void CalculateBaseBytes(const ImmutableOptions& ioptions, const MutableCFOptions& options); // Returns an estimate of the amount of live data in bytes. 
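For orientation: this patch replaces the secondary instance's hand-rolled MANIFEST replay (active_version_builders_, number_of_edits_to_skip_) with a ManifestTailer owned by ReactiveVersionSet, and ReadAndApply now simply delegates to it. Below is a minimal sketch (not part of the patch) of the public surface this machinery serves; it assumes the existing DB::OpenAsSecondary and DB::TryCatchUpWithPrimary entry points and uses placeholder paths.

#include <string>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  ROCKSDB_NAMESPACE::Options options;
  options.max_open_files = -1;  // secondary instances have required this setting
  ROCKSDB_NAMESPACE::DB* secondary = nullptr;
  // "/path/to/primary_db" and "/path/to/secondary_db" are placeholders.
  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::DB::OpenAsSecondary(
      options, "/path/to/primary_db", "/path/to/secondary_db", &secondary);
  if (s.ok()) {
    // Tail the primary's MANIFEST once; in practice this is called
    // periodically. Under the hood it drives ReactiveVersionSet::ReadAndApply
    // and the ManifestTailer introduced in this change.
    s = secondary->TryCatchUpWithPrimary();
  }
  delete secondary;
  return s.ok() ? 0 : 1;
}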
@@ -679,7 +697,21 @@ class Version { bool* is_blob = nullptr, bool do_merge = true); void MultiGet(const ReadOptions&, MultiGetRange* range, - ReadCallback* callback = nullptr, bool* is_blob = nullptr); + ReadCallback* callback = nullptr); + + // Interprets blob_index_slice as a blob reference, and (assuming the + // corresponding blob file is part of this Version) retrieves the blob and + // saves it in *value. + // REQUIRES: blob_index_slice stores an encoded blob reference + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + const Slice& blob_index_slice, PinnableSlice* value, + uint64_t* bytes_read) const; + + // Retrieves a blob using a blob reference and saves it in *value, + // assuming the corresponding blob file is part of this Version. + Status GetBlob(const ReadOptions& read_options, const Slice& user_key, + const BlobIndex& blob_index, PinnableSlice* value, + uint64_t* bytes_read) const; // Loads some stats information from files. Call without mutex held. It needs // to be called before applying the version to the version set. @@ -741,10 +773,8 @@ class Version { ColumnFamilyData* cfd() const { return cfd_; } - // Return the next Version in the linked list. Used for debug only - Version* TEST_Next() const { - return next_; - } + // Return the next Version in the linked list. + Version* Next() const { return next_; } int TEST_refs() const { return refs_; } @@ -764,6 +794,8 @@ class Version { private: Env* env_; + SystemClock* clock_; + friend class ReactiveVersionSet; friend class VersionSet; friend class VersionEditHandler; @@ -800,6 +832,7 @@ class Version { Logger* info_log_; Statistics* db_statistics_; TableCache* table_cache_; + BlobFileCache* blob_file_cache_; const MergeOperator* merge_operator_; VersionStorageInfo storage_info_; @@ -876,6 +909,7 @@ class BaseReferencedVersionBuilder; class AtomicGroupReadBuffer { public: + AtomicGroupReadBuffer() = default; Status AddEdit(VersionEdit* edit); void Clear(); bool IsFull() const; @@ -901,13 +935,25 @@ class VersionSet { WriteBufferManager* write_buffer_manager, WriteController* write_controller, BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer); + const std::shared_ptr& io_tracer, + const std::string& db_session_id); // No copying allowed VersionSet(const VersionSet&) = delete; void operator=(const VersionSet&) = delete; virtual ~VersionSet(); + Status LogAndApplyToDefaultColumnFamily( + VersionEdit* edit, InstrumentedMutex* mu, + FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = nullptr) { + ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault(); + const MutableCFOptions* cf_options = + default_cf->GetLatestMutableCFOptions(); + return LogAndApply(default_cf, *cf_options, edit, mu, db_directory, + new_descriptor_log, column_family_options); + } + // Apply *edit to the current version to form a new descriptor that // is both saved to persistent state and installed as the new // current version. Will release *mu while actually writing to the file. 
@@ -938,7 +984,8 @@ class VersionSet { const MutableCFOptions& mutable_cf_options, const autovector& edit_list, InstrumentedMutex* mu, FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, - const ColumnFamilyOptions* column_family_options = nullptr) { + const ColumnFamilyOptions* column_family_options = nullptr, + const std::function& manifest_wcb = {}) { autovector cfds; cfds.emplace_back(column_family_data); autovector mutable_cf_options_list; @@ -946,7 +993,8 @@ class VersionSet { autovector> edit_lists; edit_lists.emplace_back(edit_list); return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, - db_directory, new_descriptor_log, column_family_options); + db_directory, new_descriptor_log, column_family_options, + {manifest_wcb}); } // The across-multi-cf batch version. If edit_lists contain more than @@ -958,7 +1006,9 @@ class VersionSet { const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* db_directory = nullptr, bool new_descriptor_log = false, - const ColumnFamilyOptions* new_cf_options = nullptr); + const ColumnFamilyOptions* new_cf_options = nullptr, + const std::vector>& manifest_wcbs = + {}); static Status GetCurrentManifestPath(const std::string& dbname, FileSystem* fs, @@ -1096,10 +1146,28 @@ class VersionSet { return PreComputeMinLogNumberWithUnflushedData(nullptr); } // Returns the minimum log number which still has data not flushed to any SST + // file. + // Empty column families' log number is considered to be + // new_log_number_for_empty_cf. + uint64_t PreComputeMinLogNumberWithUnflushedData( + uint64_t new_log_number_for_empty_cf) const { + uint64_t min_log_num = port::kMaxUint64; + for (auto cfd : *column_family_set_) { + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + uint64_t num = + cfd->IsEmpty() ? new_log_number_for_empty_cf : cfd->GetLogNumber(); + if (min_log_num > num && !cfd->IsDropped()) { + min_log_num = num; + } + } + return min_log_num; + } + // Returns the minimum log number which still has data not flushed to any SST // file, except data from `cfd_to_skip`. uint64_t PreComputeMinLogNumberWithUnflushedData( const ColumnFamilyData* cfd_to_skip) const { - uint64_t min_log_num = std::numeric_limits::max(); + uint64_t min_log_num = port::kMaxUint64; for (auto cfd : *column_family_set_) { if (cfd == cfd_to_skip) { continue; @@ -1112,6 +1180,23 @@ class VersionSet { } return min_log_num; } + // Returns the minimum log number which still has data not flushed to any SST + // file, except data from `cfds_to_skip`. + uint64_t PreComputeMinLogNumberWithUnflushedData( + const std::unordered_set& cfds_to_skip) const { + uint64_t min_log_num = port::kMaxUint64; + for (auto cfd : *column_family_set_) { + if (cfds_to_skip.count(cfd)) { + continue; + } + // It's safe to ignore dropped column families here: + // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. + if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) { + min_log_num = cfd->GetLogNumber(); + } + } + return min_log_num; + } // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. 
@@ -1150,6 +1235,10 @@ class VersionSet { void GetLiveFilesMetaData(std::vector *metadata); void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) { + assert(table_cache_); + + table_cache_->Erase(GetSlice(&blob_file_number)); + obsolete_blob_files_.emplace_back(blob_file_number, std::move(path)); } @@ -1174,6 +1263,7 @@ class VersionSet { // Get the IO Status returned by written Manifest. const IOStatus& io_status() const { return io_status_; } + // The returned WalSet needs to be accessed with DB mutex held. const WalSet& GetWalSet() const { return wals_; } void TEST_CreateAndAppendVersion(ColumnFamilyData* cfd) { @@ -1198,6 +1288,7 @@ class VersionSet { friend class Version; friend class VersionEditHandler; friend class VersionEditHandlerPointInTime; + friend class DumpManifestHandler; friend class DBImpl; friend class DBImplReadOnly; friend class ManifestReader; @@ -1225,54 +1316,39 @@ class VersionSet { struct MutableCFState { uint64_t log_number; + std::string full_history_ts_low; + + explicit MutableCFState() = default; + explicit MutableCFState(uint64_t _log_number, std::string ts_low) + : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {} }; // Save current contents to *log Status WriteCurrentStateToManifest( const std::unordered_map& curr_state, - log::Writer* log, IOStatus& io_s); + const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s); void AppendVersion(ColumnFamilyData* column_family_data, Version* v); ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, const VersionEdit* edit); - Status ReadAndRecover( - log::Reader& reader, AtomicGroupReadBuffer* read_buffer, - const std::unordered_map& - name_to_options, - std::unordered_map& column_families_not_found, - std::unordered_map< - uint32_t, std::unique_ptr>& builders, - Status* log_read_status, VersionEditParams* version_edit, - std::string* db_id = nullptr); - - // REQUIRES db mutex - Status ApplyOneVersionEditToBuilder( - VersionEdit& edit, - const std::unordered_map& name_to_opts, - std::unordered_map& column_families_not_found, - std::unordered_map< - uint32_t, std::unique_ptr>& builders, - VersionEditParams* version_edit); - - Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, - const VersionEdit& from_edit, - VersionEditParams* version_edit_params); - Status VerifyFileMetadata(const std::string& fpath, const FileMetaData& meta) const; + // Protected by DB mutex. WalSet wals_; std::unique_ptr column_family_set_; + Cache* table_cache_; Env* const env_; FileSystemPtr const fs_; + SystemClock* const clock_; const std::string dbname_; std::string db_id_; const ImmutableDBOptions* const db_options_; std::atomic next_file_number_; - // Any log number equal or lower than this should be ignored during recovery, + // Any WAL number smaller than this should be ignored during recovery, // and is qualified for being deleted in 2PC mode. In non-2PC mode, this // number is ignored. std::atomic min_log_number_to_keep_2pc_ = {0}; @@ -1322,6 +1398,8 @@ class VersionSet { std::shared_ptr io_tracer_; + std::string db_session_id_; + private: // REQUIRES db mutex at beginning. 
may release and re-acquire db mutex Status ProcessManifestWrites(std::deque& writers, @@ -1352,23 +1430,20 @@ class ReactiveVersionSet : public VersionSet { Status ReadAndApply( InstrumentedMutex* mu, std::unique_ptr* manifest_reader, + Status* manifest_read_status, std::unordered_set* cfds_changed); Status Recover(const std::vector& column_families, std::unique_ptr* manifest_reader, std::unique_ptr* manifest_reporter, std::unique_ptr* manifest_reader_status); +#ifndef NDEBUG + uint64_t TEST_read_edits_in_atomic_group() const; +#endif //! NDEBUG - uint64_t TEST_read_edits_in_atomic_group() const { - return read_buffer_.TEST_read_edits_in_atomic_group(); - } - std::vector& replay_buffer() { - return read_buffer_.replay_buffer(); - } + std::vector& replay_buffer(); protected: - using VersionSet::ApplyOneVersionEditToBuilder; - // REQUIRES db mutex Status ApplyOneVersionEditToBuilder( VersionEdit& edit, std::unordered_set* cfds_changed, @@ -1379,11 +1454,7 @@ class ReactiveVersionSet : public VersionSet { std::unique_ptr* manifest_reader); private: - VersionBuilderMap active_version_builders_; - AtomicGroupReadBuffer read_buffer_; - // Number of version edits to skip by ReadAndApply at the beginning of a new - // MANIFEST created by primary. - int number_of_edits_to_skip_; + std::unique_ptr manifest_tailer_; using VersionSet::LogAndApply; using VersionSet::Recover; @@ -1393,8 +1464,9 @@ class ReactiveVersionSet : public VersionSet { const autovector& /*mutable_cf_options_list*/, const autovector>& /*edit_lists*/, InstrumentedMutex* /*mu*/, FSDirectory* /*db_directory*/, - bool /*new_descriptor_log*/, - const ColumnFamilyOptions* /*new_cf_option*/) override { + bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/, + const std::vector>& /*manifest_wcbs*/) + override { return Status::NotSupported("not supported in reactive mode"); } diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 8f2134dcee3..75919c6edc8 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -8,10 +8,12 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "db/version_set.h" + #include "db/db_impl/db_impl.h" #include "db/log_writer.h" -#include "env/mock_env.h" -#include "logging/logging.h" +#include "rocksdb/convenience.h" +#include "rocksdb/file_system.h" +#include "table/block_based/block_based_table_factory.h" #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -102,7 +104,7 @@ class VersionStorageInfoTestBase : public testing::Test { InternalKeyComparator icmp_; std::shared_ptr logger_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions mutable_cf_options_; VersionStorageInfo vstorage_; @@ -692,44 +694,39 @@ class VersionSetTestBase { int num_initial_edits_; explicit VersionSetTestBase(const std::string& name) - : mem_env_(nullptr), - env_(nullptr), - env_guard_(), - fs_(), + : env_(nullptr), dbname_(test::PerThreadDBPath(name)), options_(), db_options_(options_), cf_options_(options_), - immutable_cf_options_(db_options_, cf_options_), + immutable_options_(db_options_, cf_options_), mutable_cf_options_(cf_options_), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), shutting_down_(false), mock_table_factory_(std::make_shared()) { - const char* test_env_uri = getenv("TEST_ENV_URI"); - Env* base_env = nullptr; - if (test_env_uri) { - Status s = Env::LoadEnv(test_env_uri, &base_env, &env_guard_); - EXPECT_OK(s); - EXPECT_NE(Env::Default(), base_env); - } else { - base_env = Env::Default(); - } - EXPECT_NE(nullptr, base_env); - if (getenv("MEM_ENV")) { - mem_env_ = new MockEnv(base_env); + EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_)); + if (env_ == Env::Default() && getenv("MEM_ENV")) { + env_guard_.reset(NewMemEnv(Env::Default())); + env_ = env_guard_.get(); } - env_ = mem_env_ ? 
mem_env_ : base_env; + EXPECT_NE(nullptr, env_); - fs_ = std::make_shared(env_); - EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + fs_ = env_->GetFileSystem(); + EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr)); + options_.env = env_; db_options_.env = env_; db_options_.fs = fs_; + immutable_options_.env = env_; + immutable_options_.fs = fs_; + immutable_options_.clock = env_->GetSystemClock().get(); + versions_.reset( new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)); + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); reactive_versions_ = std::make_shared( dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, nullptr); @@ -745,10 +742,6 @@ class VersionSetTestBase { options.env = env_; EXPECT_OK(DestroyDB(dbname_, options)); } - if (mem_env_) { - delete mem_env_; - mem_env_ = nullptr; - } } protected: @@ -760,7 +753,9 @@ class VersionSetTestBase { assert(log_writer != nullptr); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { - std::unique_ptr impl(new DBImpl(DBOptions(), dbname_)); + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); std::string db_id; impl->GetDbIdentityFromIdentityFile(&db_id); new_db.SetDBId(db_id); @@ -787,13 +782,13 @@ class VersionSetTestBase { } *last_seqno = last_seq; num_initial_edits_ = static_cast(new_cfs.size() + 1); + std::unique_ptr file_writer; const std::string manifest = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = env_->GetFileSystem(); + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); ASSERT_OK(s); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_)); { log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); std::string record; @@ -816,21 +811,29 @@ class VersionSetTestBase { // Create DB with 3 column families. void NewDB() { - std::vector column_families; SequenceNumber last_seqno; std::unique_ptr log_writer; SetIdentityFile(env_, dbname_); - PrepareManifest(&column_families, &last_seqno, &log_writer); + PrepareManifest(&column_families_, &last_seqno, &log_writer); log_writer.reset(); // Make "CURRENT" file point to the new manifest file. 
Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr); ASSERT_OK(s); - EXPECT_OK(versions_->Recover(column_families, false)); - EXPECT_EQ(column_families.size(), + EXPECT_OK(versions_->Recover(column_families_, false)); + EXPECT_EQ(column_families_.size(), versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); } + void ReopenDB() { + versions_.reset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + EXPECT_OK(versions_->Recover(column_families_, false)); + } + void VerifyManifest(std::string* manifest_path) const { assert(manifest_path != nullptr); uint64_t manifest_file_number = 0; @@ -840,7 +843,63 @@ class VersionSetTestBase { ASSERT_EQ(1, manifest_file_number); } - MockEnv* mem_env_; + Status LogAndApplyToDefaultCF(VersionEdit& edit) { + mutex_.Lock(); + Status s = + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_); + mutex_.Unlock(); + return s; + } + + Status LogAndApplyToDefaultCF( + const autovector>& edits) { + autovector vedits; + for (auto& e : edits) { + vedits.push_back(e.get()); + } + mutex_.Lock(); + Status s = + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, vedits, &mutex_); + mutex_.Unlock(); + return s; + } + + void CreateNewManifest() { + constexpr FSDirectory* db_directory = nullptr; + constexpr bool new_descriptor_log = true; + mutex_.Lock(); + VersionEdit dummy; + ASSERT_OK(versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + &dummy, &mutex_, db_directory, new_descriptor_log)); + mutex_.Unlock(); + } + + ColumnFamilyData* CreateColumnFamily(const std::string& cf_name, + const ColumnFamilyOptions& cf_options) { + VersionEdit new_cf; + new_cf.AddColumnFamily(cf_name); + uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); + new_cf.SetColumnFamily(new_id); + new_cf.SetLogNumber(0); + new_cf.SetComparatorName(cf_options.comparator->Name()); + Status s; + mutex_.Lock(); + s = versions_->LogAndApply(/*column_family_data=*/nullptr, + MutableCFOptions(cf_options), &new_cf, &mutex_, + /*db_directory=*/nullptr, + /*new_descriptor_log=*/false, &cf_options); + mutex_.Unlock(); + EXPECT_OK(s); + ColumnFamilyData* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(cf_name); + EXPECT_NE(nullptr, cfd); + return cfd; + } + + Env* mem_env_; Env* env_; std::shared_ptr env_guard_; std::shared_ptr fs_; @@ -849,7 +908,7 @@ class VersionSetTestBase { Options options_; ImmutableDBOptions db_options_; ColumnFamilyOptions cf_options_; - ImmutableCFOptions immutable_cf_options_; + ImmutableOptions immutable_options_; MutableCFOptions mutable_cf_options_; std::shared_ptr table_cache_; WriteController write_controller_; @@ -859,6 +918,7 @@ class VersionSetTestBase { InstrumentedMutex mutex_; std::atomic shutting_down_; std::shared_ptr mock_table_factory_; + std::vector column_families_; }; const std::string VersionSetTestBase::kColumnFamilyName1 = "alice"; @@ -979,17 +1039,8 @@ TEST_F(VersionSetTest, PersistBlobFileStateInNewManifest) { [&](void* /* arg */) { ++garbage_encoded; }); SyncPoint::GetInstance()->EnableProcessing(); - VersionEdit dummy; - - mutex_.Lock(); - constexpr FSDirectory* db_directory = nullptr; - constexpr bool new_descriptor_log = true; - Status s = versions_->LogAndApply( - versions_->GetColumnFamilySet()->GetDefault(), 
mutable_cf_options_, - &dummy, &mutex_, db_directory, new_descriptor_log); - mutex_.Unlock(); + CreateNewManifest(); - ASSERT_OK(s); ASSERT_EQ(addition_encoded, 2); ASSERT_EQ(garbage_encoded, 1); @@ -1158,6 +1209,600 @@ TEST_F(VersionSetTest, ObsoleteBlobFile) { } } +TEST_F(VersionSetTest, WalEditsNotAppliedToVersion) { + NewDB(); + + constexpr uint64_t kNumWals = 5; + + autovector> edits; + // Add some WALs. + for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. + edits.back()->AddWal(i, WalMetadata(i)); + } + // Delete the first half of the WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals / 2 + 1); + + autovector versions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:NewVersion", + [&](void* arg) { versions.push_back(reinterpret_cast(arg)); }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Since the edits are all WAL edits, no version should be created. + ASSERT_EQ(versions.size(), 1); + ASSERT_EQ(versions[0], nullptr); +} + +// Similar to WalEditsNotAppliedToVersion, but contains a non-WAL edit. +TEST_F(VersionSetTest, NonWalEditsAppliedToVersion) { + NewDB(); + + const std::string kDBId = "db_db"; + constexpr uint64_t kNumWals = 5; + + autovector> edits; + // Add some WALs. + for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. + edits.back()->AddWal(i, WalMetadata(i)); + } + // Delete the first half of the WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals / 2 + 1); + edits.emplace_back(new VersionEdit); + edits.back()->SetDBId(kDBId); + + autovector versions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:NewVersion", + [&](void* arg) { versions.push_back(reinterpret_cast(arg)); }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // Since the edits are all WAL edits, no version should be created. + ASSERT_EQ(versions.size(), 1); + ASSERT_NE(versions[0], nullptr); +} + +TEST_F(VersionSetTest, WalAddition) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + // A WAL is just created. + { + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize()); + } + + // The WAL is synced for several times before closing. + { + for (uint64_t size_delta = 100; size_delta > 0; size_delta /= 2) { + uint64_t size = kSizeInBytes - size_delta; + WalMetadata wal(size); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), size); + } + } + + // The WAL is closed. 
+ { + WalMetadata wal(kSizeInBytes); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } + + // Recover a new VersionSet. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } +} + +TEST_F(VersionSetTest, WalCloseWithoutSync) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + constexpr uint64_t kSyncedSizeInBytes = kSizeInBytes / 2; + + // A WAL is just created. + { + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize()); + } + + // The WAL is synced before closing. + { + WalMetadata wal(kSyncedSizeInBytes); + VersionEdit edit; + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + } + + // A new WAL with larger log number is created, + // implicitly marking the current WAL closed. + { + VersionEdit edit; + edit.AddWal(kLogNumber + 1); + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + ASSERT_TRUE(wals.find(kLogNumber + 1) != wals.end()); + ASSERT_FALSE(wals.at(kLogNumber + 1).HasSyncedSize()); + } + + // Recover a new VersionSet. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kLogNumber) != wals.end()); + ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes); + } +} + +TEST_F(VersionSetTest, WalDeletion) { + NewDB(); + + constexpr WalNumber kClosedLogNumber = 10; + constexpr WalNumber kNonClosedLogNumber = 20; + constexpr uint64_t kSizeInBytes = 111; + + // Add a non-closed and a closed WAL. 
+ { + VersionEdit edit; + edit.AddWal(kClosedLogNumber, WalMetadata(kSizeInBytes)); + edit.AddWal(kNonClosedLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 2); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_TRUE(wals.find(kClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + ASSERT_TRUE(wals.at(kClosedLogNumber).HasSyncedSize()); + ASSERT_EQ(wals.at(kClosedLogNumber).GetSyncedSizeInBytes(), kSizeInBytes); + } + + // Delete the closed WAL. + { + VersionEdit edit; + edit.DeleteWalsBefore(kNonClosedLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + const auto& wals = versions_->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } + + // Recover a new VersionSet, only the non-closed WAL should show up. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } + + // Force the creation of a new MANIFEST file, + // only the non-closed WAL should be written to the new MANIFEST. + { + std::vector wal_additions; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::WriteCurrentStateToManifest:SaveWal", [&](void* arg) { + VersionEdit* edit = reinterpret_cast(arg); + ASSERT_TRUE(edit->IsWalAddition()); + for (auto& addition : edit->GetWalAdditions()) { + wal_additions.push_back(addition); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateNewManifest(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + ASSERT_EQ(wal_additions.size(), 1); + ASSERT_EQ(wal_additions[0].GetLogNumber(), kNonClosedLogNumber); + ASSERT_FALSE(wal_additions[0].GetMetadata().HasSyncedSize()); + } + + // Recover from the new MANIFEST, only the non-closed WAL should show up. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end()); + ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize()); + } +} + +TEST_F(VersionSetTest, WalCreateTwice) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + + VersionEdit edit; + edit.AddWal(kLogNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + + Status s = LogAndApplyToDefaultCF(edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") != + std::string::npos) + << s.ToString(); +} + +TEST_F(VersionSetTest, WalCreateAfterClose) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. 
+ VersionEdit edit; + edit.AddWal(kLogNumber); + WalMetadata wal(kSizeInBytes); + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + // Create the same WAL again. + VersionEdit edit; + edit.AddWal(kLogNumber); + + Status s = LogAndApplyToDefaultCF(edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") != + std::string::npos) + << s.ToString(); + } +} + +TEST_F(VersionSetTest, AddWalWithSmallerSize) { + NewDB(); + + constexpr WalNumber kLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + // Add the same WAL with smaller synced size. + VersionEdit edit; + WalMetadata wal(kSizeInBytes / 2); + edit.AddWal(kLogNumber, wal); + + Status s = LogAndApplyToDefaultCF(edit); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_TRUE( + s.ToString().find( + "WAL 10 must not have smaller synced size than previous one") != + std::string::npos) + << s.ToString(); + } +} + +TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) { + NewDB(); + + constexpr WalNumber kLogNumber0 = 10; + constexpr WalNumber kLogNumber1 = 20; + constexpr WalNumber kNonExistingNumber = 15; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add closed WALs. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kLogNumber0, wal); + edit.AddWal(kLogNumber1, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + // Delete WALs before a non-existing WAL. + VersionEdit edit; + edit.DeleteWalsBefore(kNonExistingNumber); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + // Recover a new VersionSet, WAL0 is deleted, WAL1 is not. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kLogNumber1) != wals.end()); + } +} + +TEST_F(VersionSetTest, DeleteAllWals) { + NewDB(); + + constexpr WalNumber kMaxLogNumber = 10; + constexpr uint64_t kSizeInBytes = 111; + + { + // Add a closed WAL. + VersionEdit edit; + WalMetadata wal(kSizeInBytes); + edit.AddWal(kMaxLogNumber, wal); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + { + VersionEdit edit; + edit.DeleteWalsBefore(kMaxLogNumber + 10); + + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + } + + // Recover a new VersionSet, all WALs are deleted. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(new_versions->Recover(column_families_, false)); + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 0); + } +} + +TEST_F(VersionSetTest, AtomicGroupWithWalEdits) { + NewDB(); + + constexpr int kAtomicGroupSize = 7; + constexpr uint64_t kNumWals = 5; + const std::string kDBId = "db_db"; + + int remaining = kAtomicGroupSize; + autovector> edits; + // Add 5 WALs. + for (uint64_t i = 1; i <= kNumWals; i++) { + edits.emplace_back(new VersionEdit); + // WAL's size equals its log number. 
+ edits.back()->AddWal(i, WalMetadata(i)); + edits.back()->MarkAtomicGroup(--remaining); + } + // One edit with the min log number set. + edits.emplace_back(new VersionEdit); + edits.back()->SetDBId(kDBId); + edits.back()->MarkAtomicGroup(--remaining); + // Delete the first added 4 WALs. + edits.emplace_back(new VersionEdit); + edits.back()->DeleteWalsBefore(kNumWals); + edits.back()->MarkAtomicGroup(--remaining); + ASSERT_EQ(remaining, 0); + + ASSERT_OK(LogAndApplyToDefaultCF(edits)); + + // Recover a new VersionSet, the min log number and the last WAL should be + // kept. + { + std::unique_ptr new_versions( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + std::string db_id; + ASSERT_OK( + new_versions->Recover(column_families_, /*read_only=*/false, &db_id)); + + ASSERT_EQ(db_id, kDBId); + + const auto& wals = new_versions->GetWalSet().GetWals(); + ASSERT_EQ(wals.size(), 1); + ASSERT_TRUE(wals.find(kNumWals) != wals.end()); + ASSERT_TRUE(wals.at(kNumWals).HasSyncedSize()); + ASSERT_EQ(wals.at(kNumWals).GetSyncedSizeInBytes(), kNumWals); + } +} + +class VersionSetWithTimestampTest : public VersionSetTest { + public: + static const std::string kNewCfName; + + explicit VersionSetWithTimestampTest() : VersionSetTest() {} + + void SetUp() override { + NewDB(); + Options options; + options.comparator = test::ComparatorWithU64Ts(); + cfd_ = CreateColumnFamily(kNewCfName, options); + EXPECT_NE(nullptr, cfd_); + EXPECT_NE(nullptr, cfd_->GetLatestMutableCFOptions()); + column_families_.emplace_back(kNewCfName, options); + } + + void TearDown() override { + for (auto* e : edits_) { + delete e; + } + edits_.clear(); + } + + void GenVersionEditsToSetFullHistoryTsLow( + const std::vector& ts_lbs) { + for (const auto ts_lb : ts_lbs) { + VersionEdit* edit = new VersionEdit; + edit->SetColumnFamily(cfd_->GetID()); + std::string ts_str = test::EncodeInt(ts_lb); + edit->SetFullHistoryTsLow(ts_str); + edits_.emplace_back(edit); + } + } + + void VerifyFullHistoryTsLow(uint64_t expected_ts_low) { + std::unique_ptr vset( + new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); + ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false, + /*db_id=*/nullptr)); + for (auto* cfd : *(vset->GetColumnFamilySet())) { + ASSERT_NE(nullptr, cfd); + if (cfd->GetName() == kNewCfName) { + ASSERT_EQ(test::EncodeInt(expected_ts_low), cfd->GetFullHistoryTsLow()); + } else { + ASSERT_TRUE(cfd->GetFullHistoryTsLow().empty()); + } + } + } + + void DoTest(const std::vector& ts_lbs) { + if (ts_lbs.empty()) { + return; + } + + GenVersionEditsToSetFullHistoryTsLow(ts_lbs); + + Status s; + mutex_.Lock(); + s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()), + edits_, &mutex_); + mutex_.Unlock(); + ASSERT_OK(s); + VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end())); + } + + protected: + ColumnFamilyData* cfd_{nullptr}; + // edits_ must contain and own pointers to heap-alloc VersionEdit objects. + autovector edits_; +}; + +const std::string VersionSetWithTimestampTest::kNewCfName("new_cf"); + +TEST_F(VersionSetWithTimestampTest, SetFullHistoryTsLbOnce) { + constexpr uint64_t kTsLow = 100; + DoTest({kTsLow}); +} + +// Simulate the application increasing full_history_ts_low. 
+TEST_F(VersionSetWithTimestampTest, IncreaseFullHistoryTsLb) { + const std::vector ts_lbs = {100, 101, 102, 103}; + DoTest(ts_lbs); +} + +// Simulate the application trying to decrease full_history_ts_low +// unsuccessfully. If the application calls public API sequentially to +// decrease the lower bound ts, RocksDB will return an InvalidArgument +// status before involving VersionSet. Only when multiple threads trying +// to decrease the lower bound concurrently will this case ever happen. Even +// so, the lower bound cannot be decreased. The application will be notified +// via return value of the API. +TEST_F(VersionSetWithTimestampTest, TryDecreaseFullHistoryTsLb) { + const std::vector ts_lbs = {103, 102, 101, 100}; + DoTest(ts_lbs); +} + class VersionSetAtomicGroupTest : public VersionSetTestBase, public testing::Test { public: @@ -1242,12 +1887,9 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, last_in_atomic_group_ = true; }); SyncPoint::GetInstance()->SetCallBack( - "VersionSet::ReadAndRecover:RecoveredEdits", [&](void* arg) { + "VersionEditHandlerBase::Iterate:Finish", [&](void* arg) { num_recovered_edits_ = *reinterpret_cast(arg); }); - SyncPoint::GetInstance()->SetCallBack( - "ReactiveVersionSet::ReadAndApply:AppliedEdits", - [&](void* arg) { num_applied_edits_ = *reinterpret_cast(arg); }); SyncPoint::GetInstance()->SetCallBack( "AtomicGroupReadBuffer::AddEdit:AtomicGroup", [&](void* /* arg */) { ++num_edits_in_atomic_group_; }); @@ -1287,7 +1929,6 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase, bool last_in_atomic_group_ = false; int num_edits_in_atomic_group_ = 0; int num_recovered_edits_ = 0; - int num_applied_edits_ = 0; VersionEdit corrupted_edit_; VersionEdit edit_with_incorrect_group_size_; std::unique_ptr log_writer_; @@ -1303,7 +1944,6 @@ TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) { EXPECT_TRUE(first_in_atomic_group_); EXPECT_TRUE(last_in_atomic_group_); EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1325,7 +1965,6 @@ TEST_F(VersionSetAtomicGroupTest, EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1338,20 +1977,20 @@ TEST_F(VersionSetAtomicGroupTest, EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader, &manifest_reporter, &manifest_reader_status)); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); AddNewEditsToLog(kAtomicGroupSize); InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_OK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_TRUE(first_in_atomic_group_); EXPECT_TRUE(last_in_atomic_group_); // The recover should clean up the replay buffer. 
EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); - EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); + EXPECT_EQ(kAtomicGroupSize, num_recovered_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1367,7 +2006,6 @@ TEST_F(VersionSetAtomicGroupTest, EXPECT_FALSE(last_in_atomic_group_); EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_); EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1399,14 +2037,13 @@ TEST_F(VersionSetAtomicGroupTest, InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_OK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); // Reactive version set should be empty now. EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0); EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(kAtomicGroupSize, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1423,13 +2060,14 @@ TEST_F(VersionSetAtomicGroupTest, &manifest_reader_status)); EXPECT_EQ(column_families_.size(), reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_EQ(num_initial_edits_, num_recovered_edits_); // Write a few edits in an atomic group. AddNewEditsToLog(kNumberOfPersistedVersionEdits); InstrumentedMutex mu; std::unordered_set cfds_changed; mu.Lock(); - EXPECT_OK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_OK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_TRUE(first_in_atomic_group_); EXPECT_FALSE(last_in_atomic_group_); @@ -1438,8 +2076,6 @@ TEST_F(VersionSetAtomicGroupTest, EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == kNumberOfPersistedVersionEdits); EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize); - EXPECT_EQ(num_initial_edits_, num_recovered_edits_); - EXPECT_EQ(0, num_applied_edits_); } TEST_F(VersionSetAtomicGroupTest, @@ -1486,8 +2122,8 @@ TEST_F(VersionSetAtomicGroupTest, // Write the corrupted edits. 
AddNewEditsToLog(kAtomicGroupSize); mu.Lock(); - EXPECT_NOK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_NOK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(), corrupted_edit_.DebugString()); @@ -1536,8 +2172,8 @@ TEST_F(VersionSetAtomicGroupTest, &manifest_reader_status)); AddNewEditsToLog(kAtomicGroupSize); mu.Lock(); - EXPECT_NOK( - reactive_versions_->ReadAndApply(&mu, &manifest_reader, &cfds_changed)); + EXPECT_NOK(reactive_versions_->ReadAndApply( + &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed)); mu.Unlock(); EXPECT_EQ(edits_[1].DebugString(), edit_with_incorrect_group_size_.DebugString()); @@ -1651,10 +2287,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { mutex_.Unlock(); ASSERT_OK(s); ASSERT_EQ(1, called); - if (cfd_to_drop->Unref()) { - delete cfd_to_drop; - cfd_to_drop = nullptr; - } + cfd_to_drop->UnrefAndTryDelete(); } INSTANTIATE_TEST_CASE_P( @@ -1674,14 +2307,13 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase, assert(log_writer != nullptr); VersionEdit new_db; new_db.SetLogNumber(0); - std::unique_ptr file; const std::string manifest_path = DescriptorFileName(dbname_, 1); - Status s = env_->NewWritableFile( - manifest_path, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest_path, fs->OptimizeForManifestWrite(env_options_), + &file_writer, nullptr); ASSERT_OK(s); - std::unique_ptr file_writer( - new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(file)), - manifest_path, env_options_)); log_writer->reset(new log::Writer(std::move(file_writer), 0, true)); std::string record; ASSERT_TRUE(new_db.EncodeTo(&record)); @@ -1741,19 +2373,20 @@ class VersionSetTestEmptyDb assert(nullptr != log_writer); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { - std::unique_ptr impl(new DBImpl(DBOptions(), dbname_)); + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); std::string db_id; impl->GetDbIdentityFromIdentityFile(&db_id); new_db.SetDBId(db_id); } const std::string manifest_path = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest_path, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest_path, fs->OptimizeForManifestWrite(env_options_), + &file_writer, nullptr); ASSERT_OK(s); - std::unique_ptr file_writer( - new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(file)), - manifest_path, env_options_)); { log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); std::string record; @@ -2057,16 +2690,18 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, assert(last_seqno != nullptr); assert(log_writer != nullptr); const std::string manifest = DescriptorFileName(dbname_, 1); - std::unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + Status s = WritableFileWriter::Create( + fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer, + nullptr); ASSERT_OK(s); - std::unique_ptr 
file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), manifest, env_options_)); log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { - std::unique_ptr impl(new DBImpl(DBOptions(), dbname_)); + DBOptions tmp_db_options; + tmp_db_options.env = env_; + std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); std::string db_id; impl->GetDbIdentityFromIdentityFile(&db_id); new_db.SetDBId(db_id); @@ -2144,18 +2779,17 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, std::unique_ptr file; Status s = fs_->NewWritableFile(fname, FileOptions(), &file, nullptr); ASSERT_OK(s); - std::unique_ptr fwriter( - new WritableFileWriter(std::move(file), fname, FileOptions(), env_)); - std::vector> - int_tbl_prop_collector_factories; + std::unique_ptr fwriter(new WritableFileWriter( + std::move(file), fname, FileOptions(), env_->GetSystemClock().get())); + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::unique_ptr builder(table_factory_->NewTableBuilder( TableBuilderOptions( - immutable_cf_options_, mutable_cf_options_, *internal_comparator_, + immutable_options_, mutable_cf_options_, *internal_comparator_, &int_tbl_prop_collector_factories, kNoCompression, - /*_sample_for_compression=*/0, CompressionOptions(), - /*_skip_filters=*/false, info.column_family, info.level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + info.column_family, info.level), fwriter.get())); InternalKey ikey(info.key, 0, ValueType::kTypeValue); builder->Add(ikey.Encode(), "value"); @@ -2165,11 +2799,9 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr); ASSERT_OK(s); ASSERT_NE(0, file_size); - FileMetaData meta; - meta = FileMetaData(file_num, /*file_path_id=*/0, file_size, ikey, ikey, - 0, 0, false, 0, 0, 0, kUnknownFileChecksum, - kUnknownFileChecksumFuncName); - file_metas->emplace_back(meta); + file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey, + ikey, 0, 0, false, 0, 0, 0, kUnknownFileChecksum, + kUnknownFileChecksumFuncName); } } @@ -2364,6 +2996,27 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) { } } +TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) { + NewDB(); + + SstInfo sst(100, kDefaultColumnFamilyName, "a"); + std::vector file_metas; + CreateDummyTableFiles({sst}, &file_metas); + + constexpr WalNumber kMinWalNumberToKeep2PC = 10; + VersionEdit edit; + edit.AddFile(0, file_metas[0]); + edit.SetMinLogNumberToKeep(kMinWalNumberToKeep2PC); + ASSERT_OK(LogAndApplyToDefaultCF(edit)); + ASSERT_EQ(versions_->min_log_number_to_keep_2pc(), kMinWalNumberToKeep2PC); + + for (int i = 0; i < 3; i++) { + CreateNewManifest(); + ReopenDB(); + ASSERT_EQ(versions_->min_log_number_to_keep_2pc(), kMinWalNumberToKeep2PC); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/wal_edit.cc b/db/wal_edit.cc index f1939534476..786d68b5c83 100644 --- a/db/wal_edit.cc +++ b/db/wal_edit.cc @@ -19,10 +19,6 @@ void WalAddition::EncodeTo(std::string* dst) const { PutVarint64(dst, metadata_.GetSyncedSizeInBytes()); } - if (metadata_.IsClosed()) { - PutVarint32(dst, static_cast(WalAdditionTag::kClosed)); - } - PutVarint32(dst, static_cast(WalAdditionTag::kTerminate)); } @@ -48,10 +44,6 @@ Status WalAddition::DecodeFrom(Slice* src) { 
metadata_.SetSyncedSizeInBytes(size); break; } - case WalAdditionTag::kClosed: { - metadata_.SetClosed(); - break; - } // TODO: process future tags such as checksum. case WalAdditionTag::kTerminate: return Status::OK(); @@ -66,15 +58,13 @@ Status WalAddition::DecodeFrom(Slice* src) { JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal) { jw << "LogNumber" << wal.GetLogNumber() << "SyncedSizeInBytes" - << wal.GetMetadata().GetSyncedSizeInBytes() << "Closed" - << wal.GetMetadata().IsClosed(); + << wal.GetMetadata().GetSyncedSizeInBytes(); return jw; } std::ostream& operator<<(std::ostream& os, const WalAddition& wal) { os << "log_number: " << wal.GetLogNumber() - << " synced_size_in_bytes: " << wal.GetMetadata().GetSyncedSizeInBytes() - << " closed: " << wal.GetMetadata().IsClosed(); + << " synced_size_in_bytes: " << wal.GetMetadata().GetSyncedSizeInBytes(); return os; } @@ -115,33 +105,30 @@ std::string WalDeletion::DebugString() const { } Status WalSet::AddWal(const WalAddition& wal) { + if (wal.GetLogNumber() < min_wal_number_to_keep_) { + // The WAL has been obsolete, ignore it. + return Status::OK(); + } + auto it = wals_.lower_bound(wal.GetLogNumber()); bool existing = it != wals_.end() && it->first == wal.GetLogNumber(); - if (wal.GetMetadata().IsClosed()) { - // The WAL must exist and not closed. - if (!existing) { - std::stringstream ss; - ss << "WAL " << wal.GetLogNumber() << " is not created before closing"; - return Status::Corruption("WalSet", ss.str()); - } - if (it->second.IsClosed()) { - std::stringstream ss; - ss << "WAL " << wal.GetLogNumber() << " is closed more than once"; - return Status::Corruption("WalSet", ss.str()); - } + if (existing && !wal.GetMetadata().HasSyncedSize()) { + std::stringstream ss; + ss << "WAL " << wal.GetLogNumber() << " is created more than once"; + return Status::Corruption("WalSet::AddWal", ss.str()); } // If the WAL has synced size, it must >= the previous size. - if (existing && it->second.HasSyncedSize() && - (!wal.GetMetadata().HasSyncedSize() || - wal.GetMetadata().GetSyncedSizeInBytes() < - it->second.GetSyncedSizeInBytes())) { + if (wal.GetMetadata().HasSyncedSize() && existing && + it->second.HasSyncedSize() && + wal.GetMetadata().GetSyncedSizeInBytes() < + it->second.GetSyncedSizeInBytes()) { std::stringstream ss; ss << "WAL " << wal.GetLogNumber() << " must not have smaller synced size than previous one"; - return Status::Corruption("WalSet", ss.str()); + return Status::Corruption("WalSet::AddWal", ss.str()); } if (existing) { - it->second = wal.GetMetadata(); + it->second.SetSyncedSizeInBytes(wal.GetMetadata().GetSyncedSizeInBytes()); } else { wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()}); } @@ -159,36 +146,19 @@ Status WalSet::AddWals(const WalAdditions& wals) { return s; } -Status WalSet::DeleteWal(const WalDeletion& wal) { - auto it = wals_.find(wal.GetLogNumber()); - // The WAL must exist and has been closed. 
- if (it == wals_.end()) { - std::stringstream ss; - ss << "WAL " << wal.GetLogNumber() << " must exist before deletion"; - return Status::Corruption("WalSet", ss.str()); - } - if (!it->second.IsClosed()) { - std::stringstream ss; - ss << "WAL " << wal.GetLogNumber() << " must be closed before deletion"; - return Status::Corruption("WalSet", ss.str()); +Status WalSet::DeleteWalsBefore(WalNumber wal) { + if (wal > min_wal_number_to_keep_) { + min_wal_number_to_keep_ = wal; + wals_.erase(wals_.begin(), wals_.lower_bound(wal)); } - wals_.erase(it); return Status::OK(); } -Status WalSet::DeleteWals(const WalDeletions& wals) { - Status s; - for (const WalDeletion& wal : wals) { - s = DeleteWal(wal); - if (!s.ok()) { - break; - } - } - return s; +void WalSet::Reset() { + wals_.clear(); + min_wal_number_to_keep_ = 0; } -void WalSet::Reset() { wals_.clear(); } - Status WalSet::CheckWals( Env* env, const std::unordered_map& logs_on_disk) const { diff --git a/db/wal_edit.h b/db/wal_edit.h index 9493c3648bb..7e1f9a5762d 100644 --- a/db/wal_edit.h +++ b/db/wal_edit.h @@ -35,10 +35,6 @@ class WalMetadata { explicit WalMetadata(uint64_t synced_size_bytes) : synced_size_bytes_(synced_size_bytes) {} - bool IsClosed() const { return closed_; } - - void SetClosed() { closed_ = true; } - bool HasSyncedSize() const { return synced_size_bytes_ != kUnknownWalSize; } void SetSyncedSizeInBytes(uint64_t bytes) { synced_size_bytes_ = bytes; } @@ -52,9 +48,6 @@ class WalMetadata { // Size of the most recently synced WAL in bytes. uint64_t synced_size_bytes_ = kUnknownWalSize; - - // Whether the WAL is closed. - bool closed_ = false; }; // These tags are persisted to MANIFEST, so it's part of the user API. @@ -63,8 +56,6 @@ enum class WalAdditionTag : uint32_t { kTerminate = 1, // Synced Size in bytes. kSyncedSize = 2, - // Whether the WAL is closed. - kClosed = 3, // Add tags in the future, such as checksum? }; @@ -98,10 +89,10 @@ JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal); using WalAdditions = std::vector; -// Records the event of deleting/archiving a WAL in VersionEdit. +// Records the event of deleting WALs before the specified log number. class WalDeletion { public: - WalDeletion() : number_(0) {} + WalDeletion() : number_(kEmpty) {} explicit WalDeletion(WalNumber number) : number_(number) {} @@ -113,18 +104,22 @@ class WalDeletion { std::string DebugString() const; + bool IsEmpty() const { return number_ == kEmpty; } + + void Reset() { number_ = kEmpty; } + private: + static constexpr WalNumber kEmpty = 0; + WalNumber number_; }; std::ostream& operator<<(std::ostream& os, const WalDeletion& wal); JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal); -using WalDeletions = std::vector; - // Used in VersionSet to keep the current set of WALs. // -// When a WAL is created, closed, deleted, or archived, +// When a WAL is synced or becomes obsoleted, // a VersionEdit is logged to MANIFEST and // the WAL is added to or deleted from WalSet. // @@ -139,16 +134,16 @@ class WalSet { Status AddWal(const WalAddition& wal); Status AddWals(const WalAdditions& wals); - // Delete WAL(s). - // The WAL to be deleted must exist and be closed, otherwise, - // return Status::Corruption. + // Delete WALs with log number smaller than the specified wal number. // Can happen when applying a VersionEdit or recovering from MANIFEST. - Status DeleteWal(const WalDeletion& wal); - Status DeleteWals(const WalDeletions& wals); + Status DeleteWalsBefore(WalNumber wal); // Resets the internal state. 
void Reset(); + // WALs with number less than MinWalNumberToKeep should not exist in WalSet. + WalNumber GetMinWalNumberToKeep() const { return min_wal_number_to_keep_; } + const std::map& GetWals() const { return wals_; } // Checks whether there are missing or corrupted WALs. @@ -163,6 +158,9 @@ class WalSet { private: std::map wals_; + // WAL number < min_wal_number_to_keep_ should not exist in wals_. + // It's monotonically increasing, in-memory only, not written to MANIFEST. + WalNumber min_wal_number_to_keep_ = 0; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/wal_edit_test.cc b/db/wal_edit_test.cc index b6eb347a300..5895e180d1c 100644 --- a/db/wal_edit_test.cc +++ b/db/wal_edit_test.cc @@ -24,18 +24,8 @@ TEST(WalSet, AddDeleteReset) { } ASSERT_EQ(wals.GetWals().size(), 10); - // Close WAL 1 - 5. - for (WalNumber log_number = 1; log_number <= 5; log_number++) { - WalMetadata wal(100); - wal.SetClosed(); - wals.AddWal(WalAddition(log_number, wal)); - } - ASSERT_EQ(wals.GetWals().size(), 10); - // Delete WAL 1 - 5. - for (WalNumber log_number = 1; log_number <= 5; log_number++) { - wals.DeleteWal(WalDeletion(log_number)); - } + wals.DeleteWalsBefore(6); ASSERT_EQ(wals.GetWals().size(), 5); WalNumber expected_log_number = 6; @@ -72,65 +62,43 @@ TEST(WalSet, SmallerSyncedSize) { std::string::npos); } -TEST(WalSet, CloseTwice) { +TEST(WalSet, CreateTwice) { constexpr WalNumber kNumber = 100; - constexpr uint64_t kBytes = 200; WalSet wals; ASSERT_OK(wals.AddWal(WalAddition(kNumber))); - WalMetadata wal(kBytes); - wal.SetClosed(); - ASSERT_OK(wals.AddWal(WalAddition(kNumber, wal))); - Status s = wals.AddWal(WalAddition(kNumber, wal)); + Status s = wals.AddWal(WalAddition(kNumber)); ASSERT_TRUE(s.IsCorruption()); - ASSERT_TRUE(s.ToString().find("WAL 100 is closed more than once") != + ASSERT_TRUE(s.ToString().find("WAL 100 is created more than once") != std::string::npos); } -TEST(WalSet, CloseBeforeCreate) { - constexpr WalNumber kNumber = 100; - constexpr uint64_t kBytes = 200; +TEST(WalSet, DeleteAllWals) { + constexpr WalNumber kMaxWalNumber = 10; WalSet wals; - WalMetadata wal(kBytes); - wal.SetClosed(); - Status s = wals.AddWal(WalAddition(kNumber, wal)); - ASSERT_TRUE(s.IsCorruption()); - ASSERT_TRUE(s.ToString().find("WAL 100 is not created before closing") != - std::string::npos); + for (WalNumber i = 1; i <= kMaxWalNumber; i++) { + wals.AddWal(WalAddition(i)); + } + ASSERT_OK(wals.DeleteWalsBefore(kMaxWalNumber + 1)); } -TEST(WalSet, CreateAfterClose) { +TEST(WalSet, AddObsoleteWal) { constexpr WalNumber kNumber = 100; - constexpr uint64_t kBytes = 200; WalSet wals; + ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1)); ASSERT_OK(wals.AddWal(WalAddition(kNumber))); - WalMetadata wal(kBytes); - wal.SetClosed(); - ASSERT_OK(wals.AddWal(WalAddition(kNumber, wal))); - Status s = wals.AddWal(WalAddition(kNumber)); - ASSERT_TRUE(s.IsCorruption()); - ASSERT_TRUE( - s.ToString().find( - "WAL 100 must not have smaller synced size than previous one") != - std::string::npos); -} - -TEST(WalSet, DeleteNonExistingWal) { - constexpr WalNumber kNonExistingNumber = 100; - WalSet wals; - Status s = wals.DeleteWal(WalDeletion(kNonExistingNumber)); - ASSERT_TRUE(s.IsCorruption()); - ASSERT_TRUE(s.ToString().find("WAL 100 must exist before deletion") != - std::string::npos); + ASSERT_TRUE(wals.GetWals().empty()); } -TEST(WalSet, DeleteNonClosedWal) { - constexpr WalNumber kNonClosedWalNumber = 100; +TEST(WalSet, MinWalNumberToKeep) { + constexpr WalNumber kNumber = 100; WalSet wals; - 
ASSERT_OK(wals.AddWal(WalAddition(kNonClosedWalNumber))); - Status s = wals.DeleteWal(WalDeletion(kNonClosedWalNumber)); - ASSERT_TRUE(s.IsCorruption()); - ASSERT_TRUE(s.ToString().find("WAL 100 must be closed before deletion") != - std::string::npos); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), 0); + ASSERT_OK(wals.DeleteWalsBefore(kNumber)); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber); + ASSERT_OK(wals.DeleteWalsBefore(kNumber - 1)); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber); + ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1)); + ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber + 1); } class WalSetTest : public DBTestBase { @@ -165,7 +133,6 @@ class WalSetTest : public DBTestBase { ASSERT_OK(wals_.AddWal(WalAddition(number))); // Close WAL. WalMetadata wal(size_bytes); - wal.SetClosed(); ASSERT_OK(wals_.AddWal(WalAddition(number, wal))); } diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 327f89cdf1d..359cc154678 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -134,14 +134,14 @@ Status WalManager::GetUpdatesSince( // b. get sorted non-empty archived logs // c. delete what should be deleted void WalManager::PurgeObsoleteWALFiles() { - bool const ttl_enabled = db_options_.wal_ttl_seconds > 0; - bool const size_limit_enabled = db_options_.wal_size_limit_mb > 0; + bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; if (!ttl_enabled && !size_limit_enabled) { return; } - int64_t current_time; - Status s = env_->GetCurrentTime(¤t_time); + int64_t current_time = 0; + Status s = db_options_.clock->GetCurrentTime(¤t_time); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "Can't get current time: %s", s.ToString().c_str()); @@ -150,7 +150,7 @@ void WalManager::PurgeObsoleteWALFiles() { } uint64_t const now_seconds = static_cast(current_time); uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) - ? db_options_.wal_ttl_seconds / 2 + ? 
db_options_.WAL_ttl_seconds / 2 : kDefaultIntervalToDeleteObsoleteWAL; if (purge_wal_files_last_run_ + time_to_check > now_seconds) { @@ -171,11 +171,10 @@ void WalManager::PurgeObsoleteWALFiles() { size_t log_files_num = 0; uint64_t log_file_size = 0; - for (auto& f : files) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { + if (ParseFileName(f, &number, &type) && type == kWalFile) { std::string const file_path = archival_dir + "/" + f; if (ttl_enabled) { uint64_t file_m_time; @@ -186,7 +185,7 @@ void WalManager::PurgeObsoleteWALFiles() { s.ToString().c_str()); continue; } - if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) { + if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { s = DeleteDBFile(&db_options_, file_path, archival_dir, false, /*force_fg=*/!wal_in_db_path_); if (!s.ok()) { @@ -235,17 +234,21 @@ void WalManager::PurgeObsoleteWALFiles() { return; } - size_t const files_keep_num = - static_cast(db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size); + size_t const files_keep_num = static_cast( + db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size); if (log_files_num <= files_keep_num) { return; } size_t files_del_num = log_files_num - files_keep_num; VectorLogPtr archived_logs; - GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); - - if (files_del_num > archived_logs.size()) { + s = GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Unable to get archived WALs from: %s: %s", + archival_dir.c_str(), s.ToString().c_str()); + files_del_num = 0; + } else if (files_del_num > archived_logs.size()) { ROCKS_LOG_WARN(db_options_.info_log, "Trying to delete more archived log files than " "exist. 
Deleting all"); @@ -292,7 +295,7 @@ Status WalManager::GetSortedWalsOfType(const std::string& path, for (const auto& f : all_files) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { + if (ParseFileName(f, &number, &type) && type == kWalFile) { SequenceNumber sequence; Status s = ReadFirstRecord(log_type, number, &sequence); if (!s.ok()) { diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 87c168b703c..580379a6c9e 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -5,20 +5,21 @@ #ifndef ROCKSDB_LITE +#include "db/wal_manager.h" + #include #include -#include "rocksdb/cache.h" -#include "rocksdb/write_batch.h" -#include "rocksdb/write_buffer_manager.h" - #include "db/column_family.h" #include "db/db_impl/db_impl.h" #include "db/log_writer.h" #include "db/version_set.h" -#include "db/wal_manager.h" #include "env/mock_env.h" #include "file/writable_file_writer.h" +#include "rocksdb/cache.h" +#include "rocksdb/file_system.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/write_buffer_manager.h" #include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -47,13 +48,14 @@ class WalManagerTest : public testing::Test { std::numeric_limits::max()); db_options_.wal_dir = dbname_; db_options_.env = env_.get(); - fs_.reset(new LegacyFileSystemWrapper(env_.get())); - db_options_.fs = fs_; + db_options_.fs = env_->GetFileSystem(); + db_options_.clock = env_->GetSystemClock().get(); versions_.reset( new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr)); + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_session_id*/ "")); wal_manager_.reset( new WalManager(db_options_, env_options_, nullptr /*IOTracer*/)); @@ -69,9 +71,10 @@ class WalManagerTest : public testing::Test { assert(current_log_writer_.get() != nullptr); uint64_t seq = versions_->LastSequence() + 1; WriteBatch batch; - batch.Put(key, value); + ASSERT_OK(batch.Put(key, value)); WriteBatchInternal::SetSequence(&batch, seq); - current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch)); + ASSERT_OK( + current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch))); versions_->SetLastAllocatedSequence(seq); versions_->SetLastPublishedSequence(seq); versions_->SetLastSequence(seq); @@ -81,10 +84,10 @@ class WalManagerTest : public testing::Test { void RollTheLog(bool /*archived*/) { current_log_number_++; std::string fname = ArchivedLogFileName(dbname_, current_log_number_); - std::unique_ptr file; - ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), fname, env_options_)); + const auto& fs = env_->GetFileSystem(); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(fs, fname, env_options_, &file_writer, + nullptr)); current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false)); } @@ -115,7 +118,6 @@ class WalManagerTest : public testing::Test { WriteBufferManager write_buffer_manager_; std::unique_ptr versions_; std::unique_ptr wal_manager_; - std::shared_ptr fs_; std::unique_ptr current_log_writer_; uint64_t current_log_number_; @@ -124,8 +126,9 @@ class WalManagerTest : public testing::Test { TEST_F(WalManagerTest, ReadFirstRecordCache) { Init(); std::string path = dbname_ + "/000001.log"; - std::unique_ptr file; - 
ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions())); + std::unique_ptr file; + ASSERT_OK(env_->GetFileSystem()->NewWritableFile(path, FileOptions(), &file, + nullptr)); SequenceNumber s; ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, 1 /* number */, &s)); @@ -135,14 +138,14 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) { wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s)); ASSERT_EQ(s, 0U); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), path, EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), path, FileOptions())); log::Writer writer(std::move(file_writer), 1, db_options_.recycle_log_file_num > 0); WriteBatch batch; - batch.Put("foo", "bar"); + ASSERT_OK(batch.Put("foo", "bar")); WriteBatchInternal::SetSequence(&batch, 10); - writer.AddRecord(WriteBatchInternal::Contents(&batch)); + ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch))); // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here. // Waiting for lei to finish with db_test @@ -167,14 +170,14 @@ namespace { uint64_t GetLogDirSize(std::string dir_path, Env* env) { uint64_t dir_size = 0; std::vector files; - env->GetChildren(dir_path, &files); + EXPECT_OK(env->GetChildren(dir_path, &files)); for (auto& f : files) { uint64_t number; FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { + if (ParseFileName(f, &number, &type) && type == kWalFile) { std::string const file_path = dir_path + "/" + f; uint64_t file_size; - env->GetFileSize(file_path, &file_size); + EXPECT_OK(env->GetFileSize(file_path, &file_size)); dir_size += file_size; } } @@ -184,9 +187,9 @@ std::vector ListSpecificFiles( Env* env, const std::string& path, const FileType expected_file_type) { std::vector files; std::vector file_numbers; - env->GetChildren(path, &files); uint64_t number; FileType type; + EXPECT_OK(env->GetChildren(path, &files)); for (size_t i = 0; i < files.size(); ++i) { if (ParseFileName(files[i], &number, &type)) { if (type == expected_file_type) { @@ -209,13 +212,14 @@ int CountRecords(TransactionLogIterator* iter) { EXPECT_OK(iter->status()); iter->Next(); } + EXPECT_OK(iter->status()); return count; } } // namespace TEST_F(WalManagerTest, WALArchivalSizeLimit) { - db_options_.wal_ttl_seconds = 0; - db_options_.wal_size_limit_mb = 1000; + db_options_.WAL_ttl_seconds = 0; + db_options_.WAL_size_limit_MB = 1000; Init(); // TEST : Create WalManager with huge size limit and no ttl. @@ -223,7 +227,7 @@ TEST_F(WalManagerTest, WALArchivalSizeLimit) { // Count the archived log files that survived. // Assert that all of them did. // Change size limit. Re-open WalManager. - // Assert that archive is not greater than wal_size_limit_mb after + // Assert that archive is not greater than WAL_size_limit_MB after // PurgeObsoleteWALFiles() // Set ttl and time_to_check_ to small values. Re-open db. // Assert that there are no archived logs left. 
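For reference, these tests pick up the kLogFile to kWalFile FileType rename and use the capitalized DBOptions spellings of the archival retention knobs. A minimal configuration sketch (illustrative only, not part of the patch):

    #include "rocksdb/options.h"
    using namespace ROCKSDB_NAMESPACE;

    Options MakeArchivalOptions() {
      Options options;
      options.WAL_ttl_seconds = 0;    // 0 disables TTL-based purging of archived WALs
      options.WAL_size_limit_MB = 8;  // purge oldest archived WALs once total size exceeds ~8 MB
      return options;
    }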
@@ -232,27 +236,27 @@ TEST_F(WalManagerTest, WALArchivalSizeLimit) { CreateArchiveLogs(20, 5000); std::vector log_files = - ListSpecificFiles(env_.get(), archive_dir, kLogFile); + ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_EQ(log_files.size(), 20U); - db_options_.wal_size_limit_mb = 8; + db_options_.WAL_size_limit_MB = 8; Reopen(); wal_manager_->PurgeObsoleteWALFiles(); uint64_t archive_size = GetLogDirSize(archive_dir, env_.get()); - ASSERT_TRUE(archive_size <= db_options_.wal_size_limit_mb * 1024 * 1024); + ASSERT_TRUE(archive_size <= db_options_.WAL_size_limit_MB * 1024 * 1024); - db_options_.wal_ttl_seconds = 1; + db_options_.WAL_ttl_seconds = 1; env_->FakeSleepForMicroseconds(2 * 1000 * 1000); Reopen(); wal_manager_->PurgeObsoleteWALFiles(); - log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile); + log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_TRUE(log_files.empty()); } TEST_F(WalManagerTest, WALArchivalTtl) { - db_options_.wal_ttl_seconds = 1000; + db_options_.WAL_ttl_seconds = 1000; Init(); // TEST : Create WalManager with a ttl and no size limit. @@ -265,15 +269,15 @@ TEST_F(WalManagerTest, WALArchivalTtl) { CreateArchiveLogs(20, 5000); std::vector log_files = - ListSpecificFiles(env_.get(), archive_dir, kLogFile); + ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_GT(log_files.size(), 0U); - db_options_.wal_ttl_seconds = 1; + db_options_.WAL_ttl_seconds = 1; env_->FakeSleepForMicroseconds(3 * 1000 * 1000); Reopen(); wal_manager_->PurgeObsoleteWALFiles(); - log_files = ListSpecificFiles(env_.get(), archive_dir, kLogFile); + log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile); ASSERT_TRUE(log_files.empty()); } diff --git a/db/write_batch.cc b/db/write_batch.cc index f9b13406863..1d9423e0d87 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -46,6 +46,7 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" #include "db/flush_scheduler.h" +#include "db/kv_checksum.h" #include "db/memtable.h" #include "db/merge_context.h" #include "db/snapshot_impl.h" @@ -55,6 +56,7 @@ #include "monitoring/statistics.h" #include "port/lang.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/system_clock.h" #include "util/autovector.h" #include "util/cast_util.h" #include "util/coding.h" @@ -140,10 +142,14 @@ struct BatchContentClassifier : public WriteBatch::Handler { class TimestampAssigner : public WriteBatch::Handler { public: - explicit TimestampAssigner(const Slice& ts) - : timestamp_(ts), timestamps_(kEmptyTimestampList) {} - explicit TimestampAssigner(const std::vector& ts_list) - : timestamps_(ts_list) { + explicit TimestampAssigner(const Slice& ts, + WriteBatch::ProtectionInfo* prot_info) + : timestamp_(ts), + timestamps_(kEmptyTimestampList), + prot_info_(prot_info) {} + explicit TimestampAssigner(const std::vector& ts_list, + WriteBatch::ProtectionInfo* prot_info) + : timestamps_(ts_list), prot_info_(prot_info) { SanityCheck(); } ~TimestampAssigner() override {} @@ -167,9 +173,8 @@ class TimestampAssigner : public WriteBatch::Handler { } Status DeleteRangeCF(uint32_t, const Slice& begin_key, - const Slice& end_key) override { + const Slice& /* end_key */) override { AssignTimestamp(begin_key); - AssignTimestamp(end_key); ++idx_; return Status::OK(); } @@ -221,12 +226,17 @@ class TimestampAssigner : public WriteBatch::Handler { const Slice& ts = timestamps_.empty() ? 
timestamp_ : timestamps_[idx_]; size_t ts_sz = ts.size(); char* ptr = const_cast(key.data() + key.size() - ts_sz); + if (prot_info_ != nullptr) { + Slice old_ts(ptr, ts_sz), new_ts(ts.data(), ts_sz); + prot_info_->entries_[idx_].UpdateT(old_ts, new_ts); + } memcpy(ptr, ts.data(), ts_sz); } static const std::vector kEmptyTimestampList; const Slice timestamp_; const std::vector& timestamps_; + WriteBatch::ProtectionInfo* const prot_info_; size_t idx_ = 0; // No copy or move. @@ -258,6 +268,21 @@ WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz) rep_.resize(WriteBatchInternal::kHeader); } +WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz, + size_t protection_bytes_per_key) + : content_flags_(0), max_bytes_(max_bytes), rep_(), timestamp_size_(ts_sz) { + // Currently `protection_bytes_per_key` can only be enabled at 8 bytes per + // entry. + assert(protection_bytes_per_key == 0 || protection_bytes_per_key == 8); + if (protection_bytes_per_key != 0) { + prot_info_.reset(new WriteBatch::ProtectionInfo()); + } + rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader) + ? reserved_bytes + : WriteBatchInternal::kHeader); + rep_.resize(WriteBatchInternal::kHeader); +} + WriteBatch::WriteBatch(const std::string& rep) : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), @@ -280,6 +305,10 @@ WriteBatch::WriteBatch(const WriteBatch& src) save_points_.reset(new SavePoints()); save_points_->stack = src.save_points_->stack; } + if (src.prot_info_ != nullptr) { + prot_info_.reset(new WriteBatch::ProtectionInfo()); + prot_info_->entries_ = src.prot_info_->entries_; + } } WriteBatch::WriteBatch(WriteBatch&& src) noexcept @@ -287,6 +316,7 @@ WriteBatch::WriteBatch(WriteBatch&& src) noexcept wal_term_point_(std::move(src.wal_term_point_)), content_flags_(src.content_flags_.load(std::memory_order_relaxed)), max_bytes_(src.max_bytes_), + prot_info_(std::move(src.prot_info_)), rep_(std::move(src.rep_)), timestamp_size_(src.timestamp_size_) {} @@ -331,6 +361,9 @@ void WriteBatch::Clear() { } } + if (prot_info_ != nullptr) { + prot_info_->entries_.clear(); + } wal_term_point_.clear(); } @@ -359,6 +392,13 @@ void WriteBatch::MarkWalTerminationPoint() { wal_term_point_.content_flags = content_flags_; } +size_t WriteBatch::GetProtectionBytesPerKey() const { + if (prot_info_ != nullptr) { + return prot_info_->GetBytesPerKey(); + } + return 0; +} + bool WriteBatch::HasPut() const { return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0; } @@ -640,7 +680,8 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, case kTypeBeginPrepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); - handler->MarkBeginPrepare(); + s = handler->MarkBeginPrepare(); + assert(s.ok()); empty_batch = false; if (!handler->WriteAfterCommit()) { s = Status::NotSupported( @@ -659,7 +700,8 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, case kTypeBeginPersistedPrepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); - handler->MarkBeginPrepare(); + s = handler->MarkBeginPrepare(); + assert(s.ok()); empty_batch = false; if (handler->WriteAfterCommit()) { s = Status::NotSupported( @@ -672,7 +714,8 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, case kTypeBeginUnprepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE)); - 
handler->MarkBeginPrepare(true /* unprepared */); + s = handler->MarkBeginPrepare(true /* unprepared */); + assert(s.ok()); empty_batch = false; if (handler->WriteAfterCommit()) { s = Status::NotSupported( @@ -691,23 +734,27 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, case kTypeEndPrepareXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE)); - handler->MarkEndPrepare(xid); + s = handler->MarkEndPrepare(xid); + assert(s.ok()); empty_batch = true; break; case kTypeCommitXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); - handler->MarkCommit(xid); + s = handler->MarkCommit(xid); + assert(s.ok()); empty_batch = true; break; case kTypeRollbackXID: assert(wb->content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); - handler->MarkRollback(xid); + s = handler->MarkRollback(xid); + assert(s.ok()); empty_batch = true; break; case kTypeNoop: - handler->MarkNoop(empty_batch); + s = handler->MarkNoop(empty_batch); + assert(s.ok()); empty_batch = true; break; default: @@ -770,18 +817,31 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); PutVarint32(&b->rep_, column_family_id); } + std::string timestamp(b->timestamp_size_, '\0'); if (0 == b->timestamp_size_) { PutLengthPrefixedSlice(&b->rep_, key); } else { PutVarint32(&b->rep_, static_cast(key.size() + b->timestamp_size_)); b->rep_.append(key.data(), key.size()); - b->rep_.append(b->timestamp_size_, '\0'); + b->rep_.append(timestamp); } PutLengthPrefixedSlice(&b->rep_, value); b->content_flags_.store( b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // Technically the optype could've been `kTypeColumnFamilyValue` with the + // CF ID encoded in the `WriteBatch`. That distinction is unimportant + // however since we verify CF ID is correct, as well as all other fields + // (a missing/extra encoded CF ID would corrupt another field). It is + // convenient to consolidate on `kTypeValue` here as that is what will be + // inserted into memtable. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, value, kTypeValue, timestamp) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -826,6 +886,7 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, b->rep_.push_back(static_cast(kTypeColumnFamilyValue)); PutVarint32(&b->rep_, column_family_id); } + std::string timestamp(b->timestamp_size_, '\0'); if (0 == b->timestamp_size_) { PutLengthPrefixedSliceParts(&b->rep_, key); } else { @@ -835,6 +896,14 @@ Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, b->content_flags_.store( b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. 
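The constructor overload and GetProtectionBytesPerKey() introduced above are the user-visible surface of the protection-info change: a batch built with protection_bytes_per_key == 8 appends one integrity-protection entry per operation alongside the serialized record. A minimal usage sketch (argument values are illustrative; the overload is the one added by this patch):

    #include <cassert>
    #include "rocksdb/write_batch.h"
    using namespace ROCKSDB_NAMESPACE;

    void ProtectedBatchSketch() {
      // 8 is currently the only supported non-zero protection_bytes_per_key.
      WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, 0 /* ts_sz */,
                       8 /* protection_bytes_per_key */);
      assert(batch.Put("key", "value").ok());  // also records a ProtectionInfo entry
      assert(batch.GetProtectionBytesPerKey() == 8);
    }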
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, value, kTypeValue, timestamp) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -909,17 +978,26 @@ Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, b->rep_.push_back(static_cast(kTypeColumnFamilyDeletion)); PutVarint32(&b->rep_, column_family_id); } + std::string timestamp(b->timestamp_size_, '\0'); if (0 == b->timestamp_size_) { PutLengthPrefixedSlice(&b->rep_, key); } else { PutVarint32(&b->rep_, static_cast(key.size() + b->timestamp_size_)); b->rep_.append(key.data(), key.size()); - b->rep_.append(b->timestamp_size_, '\0'); + b->rep_.append(timestamp); } b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, "" /* value */, kTypeDeletion, timestamp) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -938,6 +1016,7 @@ Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, b->rep_.push_back(static_cast(kTypeColumnFamilyDeletion)); PutVarint32(&b->rep_, column_family_id); } + std::string timestamp(b->timestamp_size_, '\0'); if (0 == b->timestamp_size_) { PutLengthPrefixedSliceParts(&b->rep_, key); } else { @@ -946,6 +1025,16 @@ Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, + SliceParts(nullptr /* _parts */, 0 /* _num_parts */), + kTypeDeletion, timestamp) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -970,6 +1059,15 @@ Status WriteBatchInternal::SingleDelete(WriteBatch* b, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_SINGLE_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVOT(key, "" /* value */, + kTypeSingleDeletion, + "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -994,6 +1092,17 @@ Status WriteBatchInternal::SingleDelete(WriteBatch* b, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_SINGLE_DELETE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, + SliceParts(nullptr /* _parts */, + 0 /* _num_parts */) /* value */, + kTypeSingleDeletion, "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1019,6 +1128,16 @@ Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE_RANGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + // In `DeleteRange()`, the end key is treated as the value. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVOT(begin_key, end_key, + kTypeRangeDeletion, + "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1044,6 +1163,16 @@ Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_DELETE_RANGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + // In `DeleteRange()`, the end key is treated as the value. + b->prot_info_->entries_.emplace_back(ProtectionInfo64() + .ProtectKVOT(begin_key, end_key, + kTypeRangeDeletion, + "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1076,6 +1205,14 @@ Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_MERGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, value, kTypeMerge, "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1106,6 +1243,14 @@ Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_MERGE, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. + b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, value, kTypeMerge, "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1131,6 +1276,14 @@ Status WriteBatchInternal::PutBlobIndex(WriteBatch* b, b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_BLOB_INDEX, std::memory_order_relaxed); + if (b->prot_info_ != nullptr) { + // See comment in first `WriteBatchInternal::Put()` overload concerning the + // `ValueType` argument passed to `ProtectKVOT()`. 
+ b->prot_info_->entries_.emplace_back( + ProtectionInfo64() + .ProtectKVOT(key, value, kTypeBlobIndex, "" /* timestamp */) + .ProtectC(column_family_id)); + } return save.commit(); } @@ -1169,6 +1322,9 @@ Status WriteBatch::RollbackToSavePoint() { Clear(); } else { rep_.resize(savepoint.size); + if (prot_info_ != nullptr) { + prot_info_->entries_.resize(savepoint.count); + } WriteBatchInternal::SetCount(this, savepoint.count); content_flags_.store(savepoint.content_flags, std::memory_order_relaxed); } @@ -1188,12 +1344,12 @@ Status WriteBatch::PopSavePoint() { } Status WriteBatch::AssignTimestamp(const Slice& ts) { - TimestampAssigner ts_assigner(ts); + TimestampAssigner ts_assigner(ts, prot_info_.get()); return Iterate(&ts_assigner); } Status WriteBatch::AssignTimestamps(const std::vector& ts_list) { - TimestampAssigner ts_assigner(ts_list); + TimestampAssigner ts_assigner(ts_list, prot_info_.get()); return Iterate(&ts_assigner); } @@ -1210,6 +1366,8 @@ class MemTableInserter : public WriteBatch::Handler { DBImpl* db_; const bool concurrent_memtable_writes_; bool post_info_created_; + const WriteBatch::ProtectionInfo* prot_info_; + size_t prot_info_idx_; bool* has_valid_writes_; // On some (!) platforms just default creating @@ -1272,6 +1430,16 @@ class MemTableInserter : public WriteBatch::Handler { (&duplicate_detector_)->IsDuplicateKeySeq(column_family_id, key, sequence_); } + const ProtectionInfoKVOTC64* NextProtectionInfo() { + const ProtectionInfoKVOTC64* res = nullptr; + if (prot_info_ != nullptr) { + assert(prot_info_idx_ < prot_info_->entries_.size()); + res = &prot_info_->entries_[prot_info_idx_]; + ++prot_info_idx_; + } + return res; + } + protected: bool WriteBeforePrepare() const override { return write_before_prepare_; } bool WriteAfterCommit() const override { return write_after_commit_; } @@ -1284,6 +1452,7 @@ class MemTableInserter : public WriteBatch::Handler { bool ignore_missing_column_families, uint64_t recovering_log_number, DB* db, bool concurrent_memtable_writes, + const WriteBatch::ProtectionInfo* prot_info, bool* has_valid_writes = nullptr, bool seq_per_batch = false, bool batch_per_txn = true, bool hint_per_batch = false) : sequence_(_sequence), @@ -1296,6 +1465,8 @@ class MemTableInserter : public WriteBatch::Handler { db_(static_cast_with_check(db)), concurrent_memtable_writes_(concurrent_memtable_writes), post_info_created_(false), + prot_info_(prot_info), + prot_info_idx_(0), has_valid_writes_(has_valid_writes), rebuilding_trx_(nullptr), rebuilding_trx_seq_(0), @@ -1353,6 +1524,10 @@ class MemTableInserter : public WriteBatch::Handler { } void set_log_number_ref(uint64_t log) { log_number_ref_ = log; } + void set_prot_info(const WriteBatch::ProtectionInfo* prot_info) { + prot_info_ = prot_info; + prot_info_idx_ = 0; + } SequenceNumber sequence() const { return sequence_; } @@ -1408,27 +1583,34 @@ class MemTableInserter : public WriteBatch::Handler { } Status PutCFImpl(uint32_t column_family_id, const Slice& key, - const Slice& value, ValueType value_type) { + const Slice& value, ValueType value_type, + const ProtectionInfoKVOTS64* kv_prot_info) { // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. 
+ return WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, + value); // else insert the values to the memtable right away } Status ret_status; if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id, + key, value); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); return ret_status; } + assert(ret_status.ok()); MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); @@ -1436,23 +1618,17 @@ class MemTableInserter : public WriteBatch::Handler { // any kind of transactions including the ones that use seq_per_batch assert(!seq_per_batch_ || !moptions->inplace_update_support); if (!moptions->inplace_update_support) { - bool mem_res = - mem->Add(sequence_, value_type, key, value, + ret_status = + mem->Add(sequence_, value_type, key, value, kv_prot_info, concurrent_memtable_writes_, get_post_process_info(mem), hint_per_batch_ ? &GetHintMap()[mem] : nullptr); - if (UNLIKELY(!mem_res)) { - assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); - } } else if (moptions->inplace_callback == nullptr) { assert(!concurrent_memtable_writes_); - mem->Update(sequence_, key, value); + ret_status = mem->Update(sequence_, key, value, kv_prot_info); } else { assert(!concurrent_memtable_writes_); - if (mem->UpdateCallback(sequence_, key, value)) { - } else { + ret_status = mem->UpdateCallback(sequence_, key, value, kv_prot_info); + if (ret_status.IsNotFound()) { // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; @@ -1466,94 +1642,149 @@ class MemTableInserter : public WriteBatch::Handler { std::string merged_value; auto cf_handle = cf_mems_->GetColumnFamilyHandle(); - Status s = Status::NotSupported(); + Status get_status = Status::NotSupported(); if (db_ != nullptr && recovering_log_number_ == 0) { if (cf_handle == nullptr) { cf_handle = db_->DefaultColumnFamily(); } - s = db_->Get(ropts, cf_handle, key, &prev_value); + get_status = db_->Get(ropts, cf_handle, key, &prev_value); } - - char* prev_buffer = const_cast(prev_value.c_str()); - uint32_t prev_size = static_cast(prev_value.size()); - auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr, - s.ok() ? &prev_size : nullptr, - value, &merged_value); - if (status == UpdateStatus::UPDATED_INPLACE) { - // prev_value is updated in-place with final value. - bool mem_res __attribute__((__unused__)); - mem_res = mem->Add( - sequence_, value_type, key, Slice(prev_buffer, prev_size)); - assert(mem_res); - RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); - } else if (status == UpdateStatus::UPDATED) { - // merged_value contains the final value. 
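PutCFImpl and the other handlers below now rely on MemTable::Add/Update/UpdateCallback returning a Status, with Status::TryAgain standing in for the old boolean failure on a duplicate key+seq in a seq_per_batch batch, and with sequence advancement and memtable-full checks happening only on success. A condensed sketch of that calling pattern, under the signatures assumed by this patch (kv_prot_info may be nullptr when per-key protection is disabled):

    Status s = mem->Add(sequence_, kTypeValue, key, value, kv_prot_info,
                        concurrent_memtable_writes_);
    if (UNLIKELY(s.IsTryAgain())) {
      assert(seq_per_batch_);
      MaybeAdvanceSeq(true /* batch_boundary */);  // start the next sub-batch
    } else if (s.ok()) {
      MaybeAdvanceSeq();
      CheckMemtableFull();
    }
    return s;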
- bool mem_res __attribute__((__unused__)); - mem_res = - mem->Add(sequence_, value_type, key, Slice(merged_value)); - assert(mem_res); - RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + // Intentionally overwrites the `NotFound` in `ret_status`. + if (!get_status.ok() && !get_status.IsNotFound()) { + ret_status = get_status; + } else { + ret_status = Status::OK(); + } + if (ret_status.ok()) { + UpdateStatus update_status; + char* prev_buffer = const_cast(prev_value.c_str()); + uint32_t prev_size = static_cast(prev_value.size()); + if (get_status.ok()) { + update_status = moptions->inplace_callback(prev_buffer, &prev_size, + value, &merged_value); + } else { + update_status = moptions->inplace_callback( + nullptr /* existing_value */, nullptr /* existing_value_size */, + value, &merged_value); + } + if (update_status == UpdateStatus::UPDATED_INPLACE) { + assert(get_status.ok()); + if (kv_prot_info != nullptr) { + ProtectionInfoKVOTS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(value, + Slice(prev_buffer, prev_size)); + // prev_value is updated in-place with final value. + ret_status = mem->Add(sequence_, value_type, key, + Slice(prev_buffer, prev_size), + &updated_kv_prot_info); + } else { + ret_status = mem->Add(sequence_, value_type, key, + Slice(prev_buffer, prev_size), + nullptr /* kv_prot_info */); + } + if (ret_status.ok()) { + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + } + } else if (update_status == UpdateStatus::UPDATED) { + if (kv_prot_info != nullptr) { + ProtectionInfoKVOTS64 updated_kv_prot_info(*kv_prot_info); + updated_kv_prot_info.UpdateV(value, merged_value); + // merged_value contains the final value. + ret_status = mem->Add(sequence_, value_type, key, + Slice(merged_value), &updated_kv_prot_info); + } else { + // merged_value contains the final value. + ret_status = + mem->Add(sequence_, value_type, key, Slice(merged_value), + nullptr /* kv_prot_info */); + } + if (ret_status.ok()) { + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); + } + } } } } + if (UNLIKELY(ret_status.IsTryAgain())) { + assert(seq_per_batch_); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id, + key, value); } - // Since all Puts are logged in transaction logs (if enabled), always bump - // sequence number. Even if the update eventually fails and does not result - // in memtable add/update. 
- MaybeAdvanceSeq(); - CheckMemtableFull(); return ret_status; } Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - return PutCFImpl(column_family_id, key, value, kTypeValue); + const auto* kv_prot_info = NextProtectionInfo(); + if (kv_prot_info != nullptr) { + // Memtable needs seqno, doesn't need CF ID + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + return PutCFImpl(column_family_id, key, value, kTypeValue, + &mem_kv_prot_info); + } + return PutCFImpl(column_family_id, key, value, kTypeValue, + nullptr /* kv_prot_info */); } Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key, - const Slice& value, ValueType delete_type) { + const Slice& value, ValueType delete_type, + const ProtectionInfoKVOTS64* kv_prot_info) { Status ret_status; MemTable* mem = cf_mems_->GetMemTable(); - bool mem_res = - mem->Add(sequence_, delete_type, key, value, + ret_status = + mem->Add(sequence_, delete_type, key, value, kv_prot_info, concurrent_memtable_writes_, get_post_process_info(mem), hint_per_batch_ ? &GetHintMap()[mem] : nullptr); - if (UNLIKELY(!mem_res)) { + if (UNLIKELY(ret_status.IsTryAgain())) { assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); } - MaybeAdvanceSeq(); - CheckMemtableFull(); return ret_status; } Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + return WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); // else insert the values to the memtable right away } Status ret_status; if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); return ret_status; } @@ -1564,78 +1795,112 @@ class MemTableInserter : public WriteBatch::Handler { : 0; const ValueType delete_type = (0 == ts_sz) ? 
kTypeDeletion : kTypeDeletionWithTimestamp; - ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type); + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + mem_kv_prot_info.UpdateO(kTypeDeletion, delete_type); + ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type, + &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type, + nullptr /* kv_prot_info */); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); } return ret_status; } Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + return WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, + key); // else insert the values to the memtable right away } Status ret_status; if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, - key); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_, + column_family_id, key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); return ret_status; } + assert(ret_status.ok()); - ret_status = - DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion); + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = DeleteImpl(column_family_id, key, Slice(), + kTypeSingleDeletion, &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, key, Slice(), + kTypeSingleDeletion, nullptr /* kv_prot_info */); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. 
So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_, + column_family_id, key); } return ret_status; } Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, const Slice& end_key) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, - begin_key, end_key); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + return WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, + begin_key, end_key); // else insert the values to the memtable right away } Status ret_status; if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, - begin_key, end_key); - // TODO(myabandeh): when transactional DeleteRange support is added, - // check if end_key must also be added. - batch_boundry = IsDuplicateKeySeq(column_family_id, begin_key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::DeleteRange( + rebuilding_trx_, column_family_id, begin_key, end_key); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, begin_key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); return ret_status; } + assert(ret_status.ok()); + if (db_ != nullptr) { auto cf_handle = cf_mems_->GetColumnFamilyHandle(); if (cf_handle == nullptr) { @@ -1644,6 +1909,8 @@ class MemTableInserter : public WriteBatch::Handler { auto* cfd = static_cast_with_check(cf_handle)->cfd(); if (!cfd->is_delete_range_supported()) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); return Status::NotSupported( std::string("DeleteRange not supported for table type ") + cfd->ioptions()->table_factory->Name() + " in CF " + @@ -1651,54 +1918,78 @@ class MemTableInserter : public WriteBatch::Handler { } int cmp = cfd->user_comparator()->Compare(begin_key, end_key); if (cmp > 0) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); // It's an empty range where endpoints appear mistaken. Don't bother // applying it to the DB, and return an error to the user. return Status::InvalidArgument("end key comes before start key"); } else if (cmp == 0) { + // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. + ret_status.PermitUncheckedError(); // It's an empty range. Don't bother applying it to the DB. 
return Status::OK(); } } - ret_status = - DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion); + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = DeleteImpl(column_family_id, begin_key, end_key, + kTypeRangeDeletion, &mem_kv_prot_info); + } else { + ret_status = DeleteImpl(column_family_id, begin_key, end_key, + kTypeRangeDeletion, nullptr /* kv_prot_info */); + } // optimize for non-recovery mode + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, - begin_key, end_key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::DeleteRange( + rebuilding_trx_, column_family_id, begin_key, end_key); } return ret_status; } Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { + const auto* kv_prot_info = NextProtectionInfo(); // optimize for non-recovery mode if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { - WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); - return Status::OK(); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + return WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, + value); // else insert the values to the memtable right away } Status ret_status; if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) { - bool batch_boundry = false; - if (rebuilding_trx_ != nullptr) { + if (ret_status.ok() && rebuilding_trx_ != nullptr) { assert(!write_after_commit_); // The CF is probably flushed and hence no need for insert but we still // need to keep track of the keys for upcoming rollback/commit. - WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, - value); - batch_boundry = IsDuplicateKeySeq(column_family_id, key); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. 
+ ret_status = WriteBatchInternal::Merge(rebuilding_trx_, + column_family_id, key, value); + if (ret_status.ok()) { + MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key)); + } + } else if (ret_status.ok()) { + MaybeAdvanceSeq(false /* batch_boundary */); } - MaybeAdvanceSeq(batch_boundry); return ret_status; } + assert(ret_status.ok()); MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); + if (moptions->merge_operator == nullptr) { + return Status::InvalidArgument( + "Merge requires `ColumnFamilyOptions::merge_operator != nullptr`"); + } bool perform_merge = false; assert(!concurrent_memtable_writes_ || moptions->max_successive_merges == 0); @@ -1736,65 +2027,97 @@ class MemTableInserter : public WriteBatch::Handler { if (cf_handle == nullptr) { cf_handle = db_->DefaultColumnFamily(); } - db_->Get(read_options, cf_handle, key, &get_value); - Slice get_value_slice = Slice(get_value); - - // 2) Apply this merge - auto merge_operator = moptions->merge_operator; - assert(merge_operator); - - std::string new_value; - - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator, key, &get_value_slice, {value}, &new_value, - moptions->info_log, moptions->statistics, Env::Default()); - - if (!merge_status.ok()) { - // Failed to merge! - // Store the delta in memtable + Status get_status = db_->Get(read_options, cf_handle, key, &get_value); + if (!get_status.ok()) { + // Failed to read a key we know exists. Store the delta in memtable. perform_merge = false; } else { - // 3) Add value to memtable - assert(!concurrent_memtable_writes_); - bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value); - if (UNLIKELY(!mem_res)) { - assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); + Slice get_value_slice = Slice(get_value); + + // 2) Apply this merge + auto merge_operator = moptions->merge_operator; + assert(merge_operator); + + std::string new_value; + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator, key, &get_value_slice, {value}, &new_value, + moptions->info_log, moptions->statistics, + SystemClock::Default().get()); + + if (!merge_status.ok()) { + // Failed to merge! 
+ // Store the delta in memtable + perform_merge = false; + } else { + // 3) Add value to memtable + assert(!concurrent_memtable_writes_); + if (kv_prot_info != nullptr) { + auto merged_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + merged_kv_prot_info.UpdateV(value, new_value); + merged_kv_prot_info.UpdateO(kTypeMerge, kTypeValue); + ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + &merged_kv_prot_info); + } else { + ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + nullptr /* kv_prot_info */); + } } } } if (!perform_merge) { - // Add merge operator to memtable - bool mem_res = - mem->Add(sequence_, kTypeMerge, key, value, - concurrent_memtable_writes_, get_post_process_info(mem)); - if (UNLIKELY(!mem_res)) { - assert(seq_per_batch_); - ret_status = Status::TryAgain("key+seq exists"); - const bool BATCH_BOUNDRY = true; - MaybeAdvanceSeq(BATCH_BOUNDRY); + assert(ret_status.ok()); + // Add merge operand to memtable + if (kv_prot_info != nullptr) { + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + ret_status = + mem->Add(sequence_, kTypeMerge, key, value, &mem_kv_prot_info, + concurrent_memtable_writes_, get_post_process_info(mem)); + } else { + ret_status = mem->Add( + sequence_, kTypeMerge, key, value, nullptr /* kv_prot_info */, + concurrent_memtable_writes_, get_post_process_info(mem)); } } + if (UNLIKELY(ret_status.IsTryAgain())) { + assert(seq_per_batch_); + const bool kBatchBoundary = true; + MaybeAdvanceSeq(kBatchBoundary); + } else if (ret_status.ok()) { + MaybeAdvanceSeq(); + CheckMemtableFull(); + } // optimize for non-recovery mode - if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + // If `ret_status` is `TryAgain` then the next (successful) try will add + // the key to the rebuilding transaction object. If `ret_status` is + // another non-OK `Status`, then the `rebuilding_trx_` will be thrown + // away. So we only need to add to it when `ret_status.ok()`. + if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) { assert(!write_after_commit_); - // If the ret_status is TryAgain then let the next try to add the ky to - // the rebuilding transaction object. - WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); + // TODO(ajkr): propagate `ProtectionInfoKVOTS64`. + ret_status = WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, + key, value); } - MaybeAdvanceSeq(); - CheckMemtableFull(); return ret_status; } Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, const Slice& value) override { - // Same as PutCF except for value type. - return PutCFImpl(column_family_id, key, value, kTypeBlobIndex); + const auto* kv_prot_info = NextProtectionInfo(); + if (kv_prot_info != nullptr) { + // Memtable needs seqno, doesn't need CF ID + auto mem_kv_prot_info = + kv_prot_info->StripC(column_family_id).ProtectS(sequence_); + // Same as PutCF except for value type. 
+ return PutCFImpl(column_family_id, key, value, kTypeBlobIndex, + &mem_kv_prot_info); + } else { + return PutCFImpl(column_family_id, key, value, kTypeBlobIndex, + nullptr /* kv_prot_info */); + } } void CheckMemtableFull() { @@ -2000,8 +2323,8 @@ Status WriteBatchInternal::InsertInto( MemTableInserter inserter( sequence, memtables, flush_scheduler, trim_history_scheduler, ignore_missing_column_families, recovery_log_number, db, - concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch, - batch_per_txn); + concurrent_memtable_writes, nullptr /* prot_info */, + nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn); for (auto w : write_group) { if (w->CallbackFailed()) { continue; @@ -2014,6 +2337,7 @@ Status WriteBatchInternal::InsertInto( } SetSequence(w->batch, inserter.sequence()); inserter.set_log_number_ref(w->log_ref); + inserter.set_prot_info(w->batch->prot_info_.get()); w->status = w->batch->Iterate(&inserter); if (!w->status.ok()) { return w->status; @@ -2035,13 +2359,15 @@ Status WriteBatchInternal::InsertInto( (void)batch_cnt; #endif assert(writer->ShouldWriteToMemtable()); - MemTableInserter inserter( - sequence, memtables, flush_scheduler, trim_history_scheduler, - ignore_missing_column_families, log_number, db, - concurrent_memtable_writes, nullptr /*has_valid_writes*/, seq_per_batch, - batch_per_txn, hint_per_batch); + MemTableInserter inserter(sequence, memtables, flush_scheduler, + trim_history_scheduler, + ignore_missing_column_families, log_number, db, + concurrent_memtable_writes, nullptr /* prot_info */, + nullptr /*has_valid_writes*/, seq_per_batch, + batch_per_txn, hint_per_batch); SetSequence(writer->batch, sequence); inserter.set_log_number_ref(writer->log_ref); + inserter.set_prot_info(writer->batch->prot_info_.get()); Status s = writer->batch->Iterate(&inserter); assert(!seq_per_batch || batch_cnt != 0); assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt); @@ -2061,8 +2387,8 @@ Status WriteBatchInternal::InsertInto( MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler, trim_history_scheduler, ignore_missing_column_families, log_number, db, - concurrent_memtable_writes, has_valid_writes, - seq_per_batch, batch_per_txn); + concurrent_memtable_writes, batch->prot_info_.get(), + has_valid_writes, seq_per_batch, batch_per_txn); Status s = batch->Iterate(&inserter); if (next_seq != nullptr) { *next_seq = inserter.sequence(); @@ -2075,6 +2401,7 @@ Status WriteBatchInternal::InsertInto( Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { assert(contents.size() >= WriteBatchInternal::kHeader); + assert(b->prot_info_ == nullptr); b->rep_.assign(contents.data(), contents.size()); b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed); return Status::OK(); @@ -2082,6 +2409,8 @@ Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src, const bool wal_only) { + assert(dst->Count() == 0 || + (dst->prot_info_ == nullptr) == (src->prot_info_ == nullptr)); size_t src_len; int src_count; uint32_t src_flags; @@ -2098,6 +2427,13 @@ Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src, src_flags = src->content_flags_.load(std::memory_order_relaxed); } + if (dst->prot_info_ != nullptr) { + std::copy(src->prot_info_->entries_.begin(), + src->prot_info_->entries_.begin() + src_count, + std::back_inserter(dst->prot_info_->entries_)); + } else if (src->prot_info_ != nullptr) 
{ + dst->prot_info_.reset(new WriteBatch::ProtectionInfo(*src->prot_info_)); + } SetCount(dst, Count(dst) + src_count); assert(src->rep_.size() >= WriteBatchInternal::kHeader); dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len); diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 30c48996516..fa863a1d62c 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -9,7 +9,9 @@ #pragma once #include + #include "db/flush_scheduler.h" +#include "db/kv_checksum.h" #include "db/trim_history_scheduler.h" #include "db/write_thread.h" #include "rocksdb/db.h" @@ -61,6 +63,14 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { MemTable* mem_; }; +struct WriteBatch::ProtectionInfo { + // `WriteBatch` usually doesn't contain a huge number of keys so protecting + // with a fixed, non-configurable eight bytes per key may work well enough. + autovector entries_; + + size_t GetBytesPerKey() const { return 8; } +}; + // WriteBatchInternal provides static methods for manipulating a // WriteBatch that we don't want in the public WriteBatch interface. class WriteBatchInternal { @@ -232,6 +242,9 @@ class LocalSavePoint { if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) { batch_->rep_.resize(savepoint_.size); WriteBatchInternal::SetCount(batch_, savepoint_.count); + if (batch_->prot_info_ != nullptr) { + batch_->prot_info_->entries_.resize(savepoint_.count); + } batch_->content_flags_.store(savepoint_.content_flags, std::memory_order_relaxed); return Status::MemoryLimit(); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 5a210b3d6dd..e4043cc406e 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -7,12 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "rocksdb/db.h" - #include + #include "db/column_family.h" +#include "db/db_test_util.h" #include "db/memtable.h" #include "db/write_batch_internal.h" +#include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/write_batch_with_index.h" @@ -23,12 +24,16 @@ namespace ROCKSDB_NAMESPACE { -static std::string PrintContents(WriteBatch* b) { +static std::string PrintContents(WriteBatch* b, + bool merge_operator_supported = true) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); Options options; options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); + if (merge_operator_supported) { + options.merge_operator.reset(new TestPutOperator()); + } + ImmutableOptions ioptions(options); WriteBufferManager wb(options.db_write_buffer_size); MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, kMaxSequenceNumber, 0 /* column_family_id */); @@ -59,10 +64,11 @@ static std::string PrintContents(WriteBatch* b) { if (iter == nullptr) { continue; } + EXPECT_OK(iter->status()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; ikey.clear(); - EXPECT_OK(ParseInternalKey(iter->key(), &ikey)); + EXPECT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */)); switch (ikey.type) { case kTypeValue: state.append("Put("); @@ -112,16 +118,19 @@ static std::string PrintContents(WriteBatch* b) { state.append("@"); state.append(NumberToString(ikey.sequence)); } + EXPECT_OK(iter->status()); } - EXPECT_EQ(b->HasPut(), put_count > 0); - EXPECT_EQ(b->HasDelete(), delete_count > 0); - EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0); - EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0); - EXPECT_EQ(b->HasMerge(), merge_count > 0); - if (!s.ok()) { + if (s.ok()) { + EXPECT_EQ(b->HasPut(), put_count > 0); + EXPECT_EQ(b->HasDelete(), delete_count > 0); + EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0); + EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0); + EXPECT_EQ(b->HasMerge(), merge_count > 0); + if (count != WriteBatchInternal::Count(b)) { + state.append("CountMismatch()"); + } + } else { state.append(s.ToString()); - } else if (count != WriteBatchInternal::Count(b)) { - state.append("CountMismatch()"); } delete mem->Unref(); return state; @@ -138,10 +147,10 @@ TEST_F(WriteBatchTest, Empty) { TEST_F(WriteBatchTest, Multiple) { WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); - batch.DeleteRange(Slice("bar"), Slice("foo")); - batch.Put(Slice("baz"), Slice("boo")); + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Delete(Slice("box"))); + ASSERT_OK(batch.DeleteRange(Slice("bar"), Slice("foo"))); + ASSERT_OK(batch.Put(Slice("baz"), Slice("boo"))); WriteBatchInternal::SetSequence(&batch, 100); ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch)); ASSERT_EQ(4u, WriteBatchInternal::Count(&batch)); @@ -156,12 +165,12 @@ TEST_F(WriteBatchTest, Multiple) { TEST_F(WriteBatchTest, Corruption) { WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Delete(Slice("box"))); WriteBatchInternal::SetSequence(&batch, 200); Slice contents = WriteBatchInternal::Contents(&batch); - WriteBatchInternal::SetContents(&batch, - Slice(contents.data(),contents.size()-1)); + ASSERT_OK(WriteBatchInternal::SetContents( + &batch, Slice(contents.data(), contents.size() - 1))); ASSERT_EQ("Put(foo, 
bar)@200" "Corruption: bad WriteBatch Delete", PrintContents(&batch)); @@ -171,24 +180,24 @@ TEST_F(WriteBatchTest, Append) { WriteBatch b1, b2; WriteBatchInternal::SetSequence(&b1, 200); WriteBatchInternal::SetSequence(&b2, 300); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("", PrintContents(&b1)); ASSERT_EQ(0u, b1.Count()); - b2.Put("a", "va"); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(b2.Put("a", "va")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("Put(a, va)@200", PrintContents(&b1)); ASSERT_EQ(1u, b1.Count()); b2.Clear(); - b2.Put("b", "vb"); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(b2.Put("b", "vb")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("Put(a, va)@200" "Put(b, vb)@201", PrintContents(&b1)); ASSERT_EQ(2u, b1.Count()); - b2.Delete("foo"); - WriteBatchInternal::Append(&b1, &b2); + ASSERT_OK(b2.Delete("foo")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2)); ASSERT_EQ("Put(a, va)@200" "Put(b, vb)@202" "Put(b, vb)@201" @@ -196,11 +205,11 @@ TEST_F(WriteBatchTest, Append) { PrintContents(&b1)); ASSERT_EQ(4u, b1.Count()); b2.Clear(); - b2.Put("c", "cc"); - b2.Put("d", "dd"); + ASSERT_OK(b2.Put("c", "cc")); + ASSERT_OK(b2.Put("d", "dd")); b2.MarkWalTerminationPoint(); - b2.Put("e", "ee"); - WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true); + ASSERT_OK(b2.Put("e", "ee")); + ASSERT_OK(WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true)); ASSERT_EQ( "Put(a, va)@200" "Put(b, vb)@202" @@ -223,10 +232,10 @@ TEST_F(WriteBatchTest, SingleDeletion) { WriteBatchInternal::SetSequence(&batch, 100); ASSERT_EQ("", PrintContents(&batch)); ASSERT_EQ(0u, batch.Count()); - batch.Put("a", "va"); + ASSERT_OK(batch.Put("a", "va")); ASSERT_EQ("Put(a, va)@100", PrintContents(&batch)); ASSERT_EQ(1u, batch.Count()); - batch.SingleDelete("a"); + ASSERT_OK(batch.SingleDelete("a")); ASSERT_EQ( "SingleDelete(a)@101" "Put(a, va)@100", @@ -316,7 +325,7 @@ namespace { TEST_F(WriteBatchTest, PutNotImplemented) { WriteBatch batch; - batch.Put(Slice("k1"), Slice("v1")); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch)); @@ -326,7 +335,7 @@ TEST_F(WriteBatchTest, PutNotImplemented) { TEST_F(WriteBatchTest, DeleteNotImplemented) { WriteBatch batch; - batch.Delete(Slice("k2")); + ASSERT_OK(batch.Delete(Slice("k2"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Delete(k2)@0", PrintContents(&batch)); @@ -336,7 +345,7 @@ TEST_F(WriteBatchTest, DeleteNotImplemented) { TEST_F(WriteBatchTest, SingleDeleteNotImplemented) { WriteBatch batch; - batch.SingleDelete(Slice("k2")); + ASSERT_OK(batch.SingleDelete(Slice("k2"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch)); @@ -346,7 +355,7 @@ TEST_F(WriteBatchTest, SingleDeleteNotImplemented) { TEST_F(WriteBatchTest, MergeNotImplemented) { WriteBatch batch; - batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); ASSERT_EQ(1u, batch.Count()); ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch)); @@ -354,16 +363,26 @@ TEST_F(WriteBatchTest, MergeNotImplemented) { ASSERT_OK(batch.Iterate(&handler)); } +TEST_F(WriteBatchTest, MergeWithoutOperatorInsertionFailure) { + WriteBatch batch; + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); + ASSERT_EQ(1u, batch.Count()); + ASSERT_EQ( + "Invalid argument: Merge requires `ColumnFamilyOptions::merge_operator " + "!= nullptr`", + PrintContents(&batch, false /* 
merge_operator_supported */)); +} + TEST_F(WriteBatchTest, Blob) { WriteBatch batch; - batch.Put(Slice("k1"), Slice("v1")); - batch.Put(Slice("k2"), Slice("v2")); - batch.Put(Slice("k3"), Slice("v3")); - batch.PutLogData(Slice("blob1")); - batch.Delete(Slice("k2")); - batch.SingleDelete(Slice("k3")); - batch.PutLogData(Slice("blob2")); - batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); + ASSERT_OK(batch.Put(Slice("k3"), Slice("v3"))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + ASSERT_OK(batch.Delete(Slice("k2"))); + ASSERT_OK(batch.SingleDelete(Slice("k3"))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); ASSERT_EQ(6u, batch.Count()); ASSERT_EQ( "Merge(foo, bar)@5" @@ -375,7 +394,7 @@ TEST_F(WriteBatchTest, Blob) { PrintContents(&batch)); TestHandler handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "Put(k1, v1)" "Put(k2, v2)" @@ -390,19 +409,19 @@ TEST_F(WriteBatchTest, Blob) { TEST_F(WriteBatchTest, PrepareCommit) { WriteBatch batch; - WriteBatchInternal::InsertNoop(&batch); - batch.Put(Slice("k1"), Slice("v1")); - batch.Put(Slice("k2"), Slice("v2")); + ASSERT_OK(WriteBatchInternal::InsertNoop(&batch)); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); batch.SetSavePoint(); - WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1")); + ASSERT_OK(WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1"))); Status s = batch.RollbackToSavePoint(); ASSERT_EQ(s, Status::NotFound()); - WriteBatchInternal::MarkCommit(&batch, Slice("xid1")); - WriteBatchInternal::MarkRollback(&batch, Slice("xid1")); + ASSERT_OK(WriteBatchInternal::MarkCommit(&batch, Slice("xid1"))); + ASSERT_OK(WriteBatchInternal::MarkRollback(&batch, Slice("xid1"))); ASSERT_EQ(2u, batch.Count()); TestHandler handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "MarkBeginPrepare(false)" "Put(k1, v1)" @@ -430,7 +449,7 @@ TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { raw[0] = c; raw[raw.length() - 1] = c; c++; - batch.Put(raw, raw); + ASSERT_OK(batch.Put(raw, raw)); } ASSERT_EQ(kNumUpdates, batch.Count()); @@ -472,7 +491,7 @@ TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { bool Continue() override { return num_seen < kNumUpdates; } } handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ(kNumUpdates, handler.num_seen); } @@ -486,7 +505,7 @@ TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { for (char i = 0; i < 2; i++) { raw[0] = 'A' + i; raw[raw.length() - 1] = 'A' - i; - batch.Put(raw, raw); + ASSERT_OK(batch.Put(raw, raw)); } ASSERT_EQ(2u, batch.Count()); @@ -523,7 +542,7 @@ TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { bool Continue() override { return num_seen < 2; } } handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ(2, handler.num_seen); } @@ -558,14 +577,14 @@ TEST_F(WriteBatchTest, Continue) { bool Continue() override { return num_seen < 5; } } handler; - batch.Put(Slice("k1"), Slice("v1")); - batch.Put(Slice("k2"), Slice("v2")); - batch.PutLogData(Slice("blob1")); - batch.Delete(Slice("k1")); - batch.SingleDelete(Slice("k2")); - batch.PutLogData(Slice("blob2")); - batch.Merge(Slice("foo"), Slice("bar")); - batch.Iterate(&handler); + ASSERT_OK(batch.Put(Slice("k1"), Slice("v1"))); + ASSERT_OK(batch.Put(Slice("k2"), Slice("v2"))); + ASSERT_OK(batch.PutLogData(Slice("blob1"))); + 
ASSERT_OK(batch.Delete(Slice("k1"))); + ASSERT_OK(batch.SingleDelete(Slice("k2"))); + ASSERT_OK(batch.PutLogData(Slice("blob2"))); + ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "Put(k1, v1)" "Put(k2, v2)" @@ -577,22 +596,22 @@ TEST_F(WriteBatchTest, Continue) { TEST_F(WriteBatchTest, PutGatherSlices) { WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); + ASSERT_OK(batch.Put(Slice("foo"), Slice("bar"))); { // Try a write where the key is one slice but the value is two Slice key_slice("baz"); Slice value_slices[2] = { Slice("header"), Slice("payload") }; - batch.Put(SliceParts(&key_slice, 1), - SliceParts(value_slices, 2)); + ASSERT_OK( + batch.Put(SliceParts(&key_slice, 1), SliceParts(value_slices, 2))); } { // One where the key is composite but the value is a single slice Slice key_slices[3] = { Slice("key"), Slice("part2"), Slice("part3") }; Slice value_slice("value"); - batch.Put(SliceParts(key_slices, 3), - SliceParts(&value_slice, 1)); + ASSERT_OK( + batch.Put(SliceParts(key_slices, 3), SliceParts(&value_slice, 1))); } WriteBatchInternal::SetSequence(&batch, 100); @@ -621,18 +640,18 @@ class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { WriteBatch batch; ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Put(&two, Slice("twofoo"), Slice("bar2")); - batch.Put(&eight, Slice("eightfoo"), Slice("bar8")); - batch.Delete(&eight, Slice("eightfoo")); - batch.SingleDelete(&two, Slice("twofoo")); - batch.DeleteRange(&two, Slice("3foo"), Slice("4foo")); - batch.Merge(&three, Slice("threethree"), Slice("3three")); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Merge(Slice("omom"), Slice("nom")); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2"))); + ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8"))); + ASSERT_OK(batch.Delete(&eight, Slice("eightfoo"))); + ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo"))); + ASSERT_OK(batch.DeleteRange(&two, Slice("3foo"), Slice("4foo"))); + ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); TestHandler handler; - batch.Iterate(&handler); + ASSERT_OK(batch.Iterate(&handler)); ASSERT_EQ( "Put(foo, bar)" "PutCF(2, twofoo, bar2)" @@ -650,14 +669,14 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { WriteBatchWithIndex batch; ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Put(&two, Slice("twofoo"), Slice("bar2")); - batch.Put(&eight, Slice("eightfoo"), Slice("bar8")); - batch.Delete(&eight, Slice("eightfoo")); - batch.SingleDelete(&two, Slice("twofoo")); - batch.Merge(&three, Slice("threethree"), Slice("3three")); - batch.Put(&zero, Slice("foo"), Slice("bar")); - batch.Merge(Slice("omom"), Slice("nom")); + ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); + ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2"))); + ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8"))); + ASSERT_OK(batch.Delete(&eight, Slice("eightfoo"))); + ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo"))); + ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); + ASSERT_OK(batch.Put(&zero, Slice("foo"), 
Slice("bar"))); + ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); std::unique_ptr iter; @@ -736,7 +755,7 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { ASSERT_TRUE(!iter->Valid()); TestHandler handler; - batch.GetWriteBatch()->Iterate(&handler); + ASSERT_OK(batch.GetWriteBatch()->Iterate(&handler)); ASSERT_EQ( "Put(foo, bar)" "PutCF(2, twofoo, bar2)" @@ -755,12 +774,12 @@ TEST_F(WriteBatchTest, SavePointTest) { WriteBatch batch; batch.SetSavePoint(); - batch.Put("A", "a"); - batch.Put("B", "b"); + ASSERT_OK(batch.Put("A", "a")); + ASSERT_OK(batch.Put("B", "b")); batch.SetSavePoint(); - batch.Put("C", "c"); - batch.Delete("A"); + ASSERT_OK(batch.Put("C", "c")); + ASSERT_OK(batch.Delete("A")); batch.SetSavePoint(); batch.SetSavePoint(); @@ -779,8 +798,8 @@ TEST_F(WriteBatchTest, SavePointTest) { "Put(B, b)@1", PrintContents(&batch)); - batch.Delete("A"); - batch.Put("B", "bb"); + ASSERT_OK(batch.Delete("A")); + ASSERT_OK(batch.Put("B", "bb")); ASSERT_OK(batch.RollbackToSavePoint()); ASSERT_EQ("", PrintContents(&batch)); @@ -789,12 +808,12 @@ TEST_F(WriteBatchTest, SavePointTest) { ASSERT_TRUE(s.IsNotFound()); ASSERT_EQ("", PrintContents(&batch)); - batch.Put("D", "d"); - batch.Delete("A"); + ASSERT_OK(batch.Put("D", "d")); + ASSERT_OK(batch.Delete("A")); batch.SetSavePoint(); - batch.Put("A", "aaa"); + ASSERT_OK(batch.Put("A", "aaa")); ASSERT_OK(batch.RollbackToSavePoint()); ASSERT_EQ( @@ -804,8 +823,8 @@ TEST_F(WriteBatchTest, SavePointTest) { batch.SetSavePoint(); - batch.Put("D", "d"); - batch.Delete("A"); + ASSERT_OK(batch.Put("D", "d")); + ASSERT_OK(batch.Delete("A")); ASSERT_OK(batch.RollbackToSavePoint()); ASSERT_EQ( @@ -826,7 +845,7 @@ TEST_F(WriteBatchTest, SavePointTest) { ASSERT_TRUE(s.IsNotFound()); ASSERT_EQ("", PrintContents(&batch2)); - batch2.Delete("A"); + ASSERT_OK(batch2.Delete("A")); batch2.SetSavePoint(); s = batch2.RollbackToSavePoint(); @@ -838,7 +857,7 @@ TEST_F(WriteBatchTest, SavePointTest) { batch2.SetSavePoint(); - batch2.Delete("B"); + ASSERT_OK(batch2.Delete("B")); ASSERT_EQ("Delete(B)@0", PrintContents(&batch2)); batch2.SetSavePoint(); @@ -861,7 +880,7 @@ TEST_F(WriteBatchTest, SavePointTest) { ASSERT_EQ("", PrintContents(&batch3)); batch3.SetSavePoint(); - batch3.Delete("A"); + ASSERT_OK(batch3.Delete("A")); s = batch3.PopSavePoint(); ASSERT_OK(s); diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index 4bfc4e911f4..a2d3f94c4bf 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -111,7 +111,7 @@ TEST_P(WriteCallbackPTest, WriteWithCallbackTest) { void Put(const string& key, const string& val) { kvs_.push_back(std::make_pair(key, val)); - write_batch_.Put(key, val); + ASSERT_OK(write_batch_.Put(key, val)); } void Clear() { @@ -319,7 +319,7 @@ TEST_P(WriteCallbackPTest, WriteWithCallbackTest) { DBImpl* db_impl_; } publish_seq_callback(db_impl); // seq_per_batch_ requires a natural batch separator or Noop - WriteBatchInternal::InsertNoop(&write_op.write_batch_); + ASSERT_OK(WriteBatchInternal::InsertNoop(&write_op.write_batch_)); const size_t ONE_BATCH = 1; s = db_impl->WriteImpl(woptions, &write_op.write_batch_, &write_op.callback_, nullptr, 0, false, nullptr, @@ -396,8 +396,8 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) { WriteBatch wb; - wb.Put("a", "value.a"); - wb.Delete("x"); + ASSERT_OK(wb.Put("a", "value.a")); + ASSERT_OK(wb.Delete("x")); // Test a simple Write s = db->Write(write_options, &wb); @@ -411,7 +411,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) { WriteCallbackTestWriteCallback1 
callback1; WriteBatch wb2; - wb2.Put("a", "value.a2"); + ASSERT_OK(wb2.Put("a", "value.a2")); s = db_impl->WriteWithCallback(write_options, &wb2, &callback1); ASSERT_OK(s); @@ -425,7 +425,7 @@ TEST_F(WriteCallbackTest, WriteCallBackTest) { WriteCallbackTestWriteCallback2 callback2; WriteBatch wb3; - wb3.Put("a", "value.a3"); + ASSERT_OK(wb3.Put("a", "value.a3")); s = db_impl->WriteWithCallback(write_options, &wb3, &callback2); ASSERT_NOK(s); diff --git a/db/write_controller.cc b/db/write_controller.cc index 5480aabd149..c5f7443752f 100644 --- a/db/write_controller.cc +++ b/db/write_controller.cc @@ -5,10 +5,12 @@ #include "db/write_controller.h" +#include #include #include #include -#include "rocksdb/env.h" + +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -19,10 +21,14 @@ std::unique_ptr WriteController::GetStopToken() { std::unique_ptr WriteController::GetDelayToken( uint64_t write_rate) { - total_delayed_++; - // Reset counters. - last_refill_time_ = 0; - bytes_left_ = 0; + if (0 == total_delayed_++) { + // Starting delay, so reset counters. + next_refill_time_ = 0; + credit_in_bytes_ = 0; + } + // NOTE: for simplicity, any current credit_in_bytes_ or "debt" in + // next_refill_time_ will be based on an old rate. This rate will apply + // for subsequent additional debts and for the next refill. set_delayed_write_rate(write_rate); return std::unique_ptr(new DelayWriteToken(this)); } @@ -42,7 +48,7 @@ bool WriteController::IsStopped() const { // If it turns out to be a performance issue, we can redesign the thread // synchronization model here. // The function trust caller will sleep micros returned. -uint64_t WriteController::GetDelay(Env* env, uint64_t num_bytes) { +uint64_t WriteController::GetDelay(SystemClock* clock, uint64_t num_bytes) { if (total_stopped_.load(std::memory_order_relaxed) > 0) { return 0; } @@ -50,64 +56,51 @@ uint64_t WriteController::GetDelay(Env* env, uint64_t num_bytes) { return 0; } - const uint64_t kMicrosPerSecond = 1000000; - const uint64_t kRefillInterval = 1024U; - - if (bytes_left_ >= num_bytes) { - bytes_left_ -= num_bytes; + if (credit_in_bytes_ >= num_bytes) { + credit_in_bytes_ -= num_bytes; return 0; } // The frequency to get time inside DB mutex is less than one per refill // interval. - auto time_now = NowMicrosMonotonic(env); - - uint64_t sleep_debt = 0; - uint64_t time_since_last_refill = 0; - if (last_refill_time_ != 0) { - if (last_refill_time_ > time_now) { - sleep_debt = last_refill_time_ - time_now; - } else { - time_since_last_refill = time_now - last_refill_time_; - bytes_left_ += - static_cast(static_cast(time_since_last_refill) / - kMicrosPerSecond * delayed_write_rate_); - if (time_since_last_refill >= kRefillInterval && - bytes_left_ > num_bytes) { - // If refill interval already passed and we have enough bytes - // return without extra sleeping. 
- last_refill_time_ = time_now; - bytes_left_ -= num_bytes; - return 0; - } + auto time_now = NowMicrosMonotonic(clock); + + const uint64_t kMicrosPerSecond = 1000000; + // Refill every 1 ms + const uint64_t kMicrosPerRefill = 1000; + + if (next_refill_time_ == 0) { + // Start with an initial allotment of bytes for one interval + next_refill_time_ = time_now; + } + if (next_refill_time_ <= time_now) { + // Refill based on time interval plus any extra elapsed + uint64_t elapsed = time_now - next_refill_time_ + kMicrosPerRefill; + credit_in_bytes_ += static_cast<uint64_t>( + 1.0 * elapsed / kMicrosPerSecond * delayed_write_rate_ + 0.999999); + next_refill_time_ = time_now + kMicrosPerRefill; + + if (credit_in_bytes_ >= num_bytes) { + // Avoid delay if possible, to reduce DB mutex release & re-acquire. + credit_in_bytes_ -= num_bytes; + return 0; + } } - uint64_t single_refill_amount = - delayed_write_rate_ * kRefillInterval / kMicrosPerSecond; - if (bytes_left_ + single_refill_amount >= num_bytes) { - // Wait until a refill interval - // Never trigger expire for less than one refill interval to avoid to get - // time. - bytes_left_ = bytes_left_ + single_refill_amount - num_bytes; - last_refill_time_ = time_now + kRefillInterval; - return kRefillInterval + sleep_debt; - } + // We need to delay to avoid exceeding write rate. + assert(num_bytes > credit_in_bytes_); + uint64_t bytes_over_budget = num_bytes - credit_in_bytes_; + uint64_t needed_delay = static_cast<uint64_t>( + 1.0 * bytes_over_budget / delayed_write_rate_ * kMicrosPerSecond); + + credit_in_bytes_ = 0; + next_refill_time_ += needed_delay; - // Need to refill more than one interval. Need to sleep longer. Check - // whether expiration will hit - - // Sleep just until `num_bytes` is allowed. - uint64_t sleep_amount = - static_cast(num_bytes / - static_cast(delayed_write_rate_) * - kMicrosPerSecond) + - sleep_debt; - last_refill_time_ = time_now + sleep_amount; - return sleep_amount; + // Minimum delay of refill interval, to reduce DB mutex contention. + return std::max(next_refill_time_ - time_now, kMicrosPerRefill); } -uint64_t WriteController::NowMicrosMonotonic(Env* env) { - return env->NowNanos() / std::milli::den; +uint64_t WriteController::NowMicrosMonotonic(SystemClock* clock) { + return clock->NowNanos() / std::milli::den; } StopWriteToken::~StopWriteToken() { diff --git a/db/write_controller.h b/db/write_controller.h index 785ae68965d..88bd1417f19 100644 --- a/db/write_controller.h +++ b/db/write_controller.h @@ -13,7 +13,7 @@ namespace ROCKSDB_NAMESPACE { -class Env; +class SystemClock; class WriteControllerToken; // WriteController is controlling write stalls in our write code-path. Write @@ -27,8 +27,8 @@ class WriteController { : total_stopped_(0), total_delayed_(0), total_compaction_pressure_(0), - bytes_left_(0), - last_refill_time_(0), + credit_in_bytes_(0), + next_refill_time_(0), low_pri_rate_limiter_( NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) { set_max_delayed_write_rate(_delayed_write_rate); @@ -57,7 +57,7 @@ class WriteController { // return how many microseconds the caller needs to sleep after the call // num_bytes: how many number of bytes to put into the DB. // Prerequisite: DB mutex held.
- uint64_t GetDelay(Env* env, uint64_t num_bytes); + uint64_t GetDelay(SystemClock* clock, uint64_t num_bytes); void set_delayed_write_rate(uint64_t write_rate) { // avoid divide 0 if (write_rate == 0) { @@ -85,7 +85,7 @@ class WriteController { RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); } private: - uint64_t NowMicrosMonotonic(Env* env); + uint64_t NowMicrosMonotonic(SystemClock* clock); friend class WriteControllerToken; friend class StopWriteToken; @@ -95,11 +95,14 @@ class WriteController { std::atomic total_stopped_; std::atomic total_delayed_; std::atomic total_compaction_pressure_; - uint64_t bytes_left_; - uint64_t last_refill_time_; - // write rate set when initialization or by `DBImpl::SetDBOptions` + + // Number of bytes allowed to write without delay + uint64_t credit_in_bytes_; + // Next time that we can add more credit of bytes + uint64_t next_refill_time_; + // Write rate set when initialization or by `DBImpl::SetDBOptions` uint64_t max_delayed_write_rate_; - // current write rate + // Current write rate (bytes / second) uint64_t delayed_write_rate_; std::unique_ptr low_pri_rate_limiter_; diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc index 72d116798e8..1f7cf999aaf 100644 --- a/db/write_controller_test.cc +++ b/db/write_controller_test.cc @@ -3,128 +3,240 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#include - #include "db/write_controller.h" -#include "rocksdb/env.h" +#include +#include + +#include "rocksdb/system_clock.h" #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { - -class WriteControllerTest : public testing::Test {}; - -class TimeSetEnv : public EnvWrapper { +namespace { +class TimeSetClock : public SystemClockWrapper { public: - explicit TimeSetEnv() : EnvWrapper(nullptr) {} + explicit TimeSetClock() : SystemClockWrapper(nullptr) {} + const char* Name() const override { return "TimeSetClock"; } uint64_t now_micros_ = 6666; uint64_t NowNanos() override { return now_micros_ * std::milli::den; } }; +} // namespace +class WriteControllerTest : public testing::Test { + public: + WriteControllerTest() { clock_ = std::make_shared(); } + std::shared_ptr clock_; +}; -TEST_F(WriteControllerTest, ChangeDelayRateTest) { - TimeSetEnv env; - WriteController controller(40000000u); // also set max delayed rate - controller.set_delayed_write_rate(10000000u); - auto delay_token_0 = - controller.GetDelayToken(controller.delayed_write_rate()); - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(&env, 20000000u)); - auto delay_token_1 = controller.GetDelayToken(2000000u); - ASSERT_EQ(static_cast(10000000), - controller.GetDelay(&env, 20000000u)); - auto delay_token_2 = controller.GetDelayToken(1000000u); - ASSERT_EQ(static_cast(20000000), - controller.GetDelay(&env, 20000000u)); - auto delay_token_3 = controller.GetDelayToken(20000000u); - ASSERT_EQ(static_cast(1000000), - controller.GetDelay(&env, 20000000u)); - // This is more than max rate. Max delayed rate will be used. 
- auto delay_token_4 = - controller.GetDelayToken(controller.delayed_write_rate() * 3); - ASSERT_EQ(static_cast(500000), - controller.GetDelay(&env, 20000000u)); +// Make tests easier to read +#define MILLION *1000000u +#define MB MILLION +#define MBPS MILLION +#define SECS MILLION // in microseconds + +TEST_F(WriteControllerTest, BasicAPI) { + WriteController controller(40 MBPS); // also set max delayed rate + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + + // set, get + controller.set_delayed_write_rate(20 MBPS); + EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_FALSE(controller.NeedsDelay()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + + { + // set with token, get + auto delay_token_0 = controller.GetDelayToken(10 MBPS); + EXPECT_EQ(controller.delayed_write_rate(), 10 MBPS); + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + // test with delay + EXPECT_EQ(2 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 2 SECS; // pay the "debt" + + auto delay_token_1 = controller.GetDelayToken(2 MBPS); + EXPECT_EQ(10 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 10 SECS; // pay the "debt" + + auto delay_token_2 = controller.GetDelayToken(1 MBPS); + EXPECT_EQ(20 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 20 SECS; // pay the "debt" + + auto delay_token_3 = controller.GetDelayToken(20 MBPS); + EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 20 MB)); + clock_->now_micros_ += 1 SECS; // pay the "debt" + + // 60M is more than the max rate of 40M. Max rate will be used. 
+ EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS); + auto delay_token_4 = + controller.GetDelayToken(controller.delayed_write_rate() * 3); + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + EXPECT_EQ(static_cast(0.5 SECS), + controller.GetDelay(clock_.get(), 20 MB)); + + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + + // Test stop tokens + { + auto stop_token_1 = controller.GetStopToken(); + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + { + auto stop_token_2 = controller.GetStopToken(); + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + } + EXPECT_TRUE(controller.IsStopped()); + EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB)); + } + // Stop tokens released + EXPECT_FALSE(controller.IsStopped()); + EXPECT_TRUE(controller.NeedsDelay()); + EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS); + // pay the previous "debt" + clock_->now_micros_ += static_cast(0.5 SECS); + EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 40 MB)); + } + + // Delay tokens released + EXPECT_FALSE(controller.NeedsDelay()); } -TEST_F(WriteControllerTest, SanityTest) { - WriteController controller(10000000u); - auto stop_token_1 = controller.GetStopToken(); - auto stop_token_2 = controller.GetStopToken(); - - ASSERT_TRUE(controller.IsStopped()); - stop_token_1.reset(); - ASSERT_TRUE(controller.IsStopped()); - stop_token_2.reset(); - ASSERT_FALSE(controller.IsStopped()); - - TimeSetEnv env; - - auto delay_token_1 = controller.GetDelayToken(10000000u); - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(&env, 20000000u)); - - env.now_micros_ += 1999900u; // sleep debt 1000 +TEST_F(WriteControllerTest, StartFilled) { + WriteController controller(10 MBPS); - auto delay_token_2 = controller.GetDelayToken(10000000u); - // Rate reset after changing the token. - ASSERT_EQ(static_cast(2000000), - controller.GetDelay(&env, 20000000u)); - - env.now_micros_ += 1999900u; // sleep debt 1000 - - // One refill: 10240 bytes allowed, 1000 used, 9240 left - ASSERT_EQ(static_cast(1124), controller.GetDelay(&env, 1000u)); - env.now_micros_ += 1124u; // sleep debt 0 - - delay_token_2.reset(); - // 1000 used, 8240 left - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 1000u)); - - env.now_micros_ += 100u; // sleep credit 100 - // 1000 used, 7240 left - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 1000u)); - - env.now_micros_ += 100u; // sleep credit 200 - // One refill: 10240 fileed, sleep credit generates 2000. 8000 used - // 7240 + 10240 + 2000 - 8000 = 11480 left - ASSERT_EQ(static_cast(1024u), controller.GetDelay(&env, 8000u)); - - env.now_micros_ += 200u; // sleep debt 824 - // 1000 used, 10480 left. - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 1000u)); - - env.now_micros_ += 200u; // sleep debt 624 - // Out of bound sleep, still 10480 left - ASSERT_EQ(static_cast(3000624u), - controller.GetDelay(&env, 30000000u)); - - env.now_micros_ += 3000724u; // sleep credit 100 - // 6000 used, 4480 left. 
- ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 6000u)); - - env.now_micros_ += 200u; // sleep credit 300 - // One refill, credit 4480 balance + 3000 credit + 10240 refill - // Use 8000, 9720 left - ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 8000u)); + // Attempt to write two things that combined would be allowed within + // a single refill interval + auto delay_token_0 = + controller.GetDelayToken(controller.delayed_write_rate()); - env.now_micros_ += 3024u; // sleep credit 2000 + // Verify no delay because write rate has not been exceeded within + // refill interval. + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); - // 1720 left - ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u)); + // Allow refill (kMicrosPerRefill) + clock_->now_micros_ += 1000; - // 1720 balance + 20000 credit = 20170 left - // Use 8000, 12170 left - ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u)); + // Again + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/)); - // 4170 left - ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u)); + // Control: something bigger that would exceed write rate within interval + uint64_t delay = controller.GetDelay(clock_.get(), 10 MB); + EXPECT_GT(1.0 * delay, 0.999 SECS); + EXPECT_LT(1.0 * delay, 1.001 SECS); +} - // Need a refill - ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 9000u)); +TEST_F(WriteControllerTest, DebtAccumulation) { + WriteController controller(10 MBPS); + + std::array<std::unique_ptr<WriteControllerToken>, 10> tokens; + + // Accumulate a time delay debt with no passage of time, like many column + // families delaying writes simultaneously. (Old versions of WriteController + // would reset the debt on every GetDelayToken.) + uint64_t debt = 0; + for (unsigned i = 0; i < tokens.size(); ++i) { + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + uint64_t delay = controller.GetDelay(clock_.get(), 63 MB); + ASSERT_GT(delay, debt); + uint64_t incremental = delay - debt; + ASSERT_EQ(incremental, (63 SECS) / (i + 1u)); + debt += incremental; + } + + // Pay down the debt + clock_->now_micros_ += debt; + debt = 0; + + // Now accumulate debt with some passage of time. + for (unsigned i = 0; i < tokens.size(); ++i) { + // Debt is accumulated in time, not in bytes, so this new write + // limit is not applied to prior requested delays, even if they are + // in progress. + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + uint64_t delay = controller.GetDelay(clock_.get(), 63 MB); + ASSERT_GT(delay, debt); + uint64_t incremental = delay - debt; + ASSERT_EQ(incremental, (63 SECS) / (i + 1u)); + debt += incremental; + uint64_t credit = debt / 2; + clock_->now_micros_ += credit; + debt -= credit; + } + + // Pay down the debt + clock_->now_micros_ += debt; + debt = 0; // consistent state + (void)debt; // appease clang-analyze + + // Verify paid down + EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); + + // Accumulate another debt, without accounting, while releasing tokens + for (unsigned i = 0; i < tokens.size(); ++i) { + // Big and small are delayed + ASSERT_LT(0U, controller.GetDelay(clock_.get(), 63 MB)); + ASSERT_LT(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); + tokens[i].reset(); + } + // All tokens released. + // Verify that releasing all tokens pays down debt, even with no time passage.
+ tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/)); +} - delay_token_1.reset(); - ASSERT_EQ(static_cast(0), controller.GetDelay(&env, 30000000u)); - delay_token_1.reset(); - ASSERT_FALSE(controller.IsStopped()); +// This may or may not be a "good" feature, but it's an old feature +TEST_F(WriteControllerTest, CreditAccumulation) { + WriteController controller(10 MBPS); + + std::array, 10> tokens; + + // Ensure started + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); + clock_->now_micros_ += 10 SECS; + + // Accumulate a credit + uint64_t credit = 1000 SECS /* see below: * 1 MB / 1 SEC */; + clock_->now_micros_ += credit; + + // Spend some credit (burst of I/O) + for (unsigned i = 0; i < tokens.size(); ++i) { + tokens[i] = controller.GetDelayToken((i + 1u) MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 63 MB)); + // In WriteController, credit is accumulated in bytes, not in time. + // After an "unnecessary" delay, all of our time credit will be + // translated to bytes on the next operation, in this case with + // setting 1 MBPS. So regardless of the rate at delay time, we just + // account for the bytes. + credit -= 63 MB; + } + // Spend remaining credit + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), credit)); + // Verify + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); + clock_->now_micros_ += 10 SECS; + + // Accumulate a credit, no accounting + clock_->now_micros_ += 1000 SECS; + + // Spend a small amount, releasing tokens + for (unsigned i = 0; i < tokens.size(); ++i) { + ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 3 MB)); + tokens[i].reset(); + } + + // All tokens released. + // Verify credit is wiped away on new delay. + tokens[0] = controller.GetDelayToken(1 MBPS); + ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB)); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/write_thread.cc b/db/write_thread.cc index d26a694aac7..ac3a2f86915 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -208,6 +208,7 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask, } void WriteThread::SetState(Writer* w, uint8_t new_state) { + assert(w); auto state = w->state.load(std::memory_order_acquire); if (state == STATE_LOCKED_WAITING || !w->state.compare_exchange_strong(state, new_state)) { @@ -240,6 +241,7 @@ bool WriteThread::LinkOne(Writer* w, std::atomic* newest_writer) { MutexLock lock(&stall_mu_); writers = newest_writer->load(std::memory_order_relaxed); if (writers == &write_stall_dummy_) { + TEST_SYNC_POINT_CALLBACK("WriteThread::WriteStall::Wait", w); stall_cv_.Wait(); // Load newest_writers_ again since it may have changed writers = newest_writer->load(std::memory_order_relaxed); @@ -464,6 +466,11 @@ size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader, break; } + if (w->protection_bytes_per_key != leader->protection_bytes_per_key) { + // Do not mix writes with different levels of integrity protection. + break; + } + if (w->batch == nullptr) { // Do not include those writes with nullptr batch. Those are not writes, // those are something else. 
They want to be alone diff --git a/db/write_thread.h b/db/write_thread.h index 41cb9842c90..b050606aa54 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -5,12 +5,11 @@ #pragma once -#include -#include - #include +#include #include #include +#include #include #include #include @@ -37,7 +36,7 @@ class WriteThread { // non-parallel informs a follower that its writes have been committed // (-> STATE_COMPLETED), or when a leader that has chosen to perform // updates in parallel and needs this Writer to apply its batch (-> - // STATE_PARALLEL_FOLLOWER). + // STATE_PARALLEL_MEMTABLE_WRITER). STATE_INIT = 1, // The state used to inform a waiting Writer that it has become the @@ -76,7 +75,6 @@ class WriteThread { struct Writer; struct WriteGroup { - ~WriteGroup() { status.PermitUncheckedError(); } Writer* leader = nullptr; Writer* last_writer = nullptr; SequenceNumber last_sequence; @@ -121,6 +119,7 @@ class WriteThread { bool disable_wal; bool disable_memtable; size_t batch_cnt; // if non-zero, number of sub-batches in the write batch + size_t protection_bytes_per_key; PreReleaseCallback* pre_release_callback; uint64_t log_used; // log number that this batch was inserted into uint64_t log_ref; // log number that memtable insert should reference @@ -130,7 +129,7 @@ class WriteThread { WriteGroup* write_group; SequenceNumber sequence; // the sequence number to use for the first key Status status; - Status callback_status; // status returned by callback->Callback() + Status callback_status; // status returned by callback->Callback() std::aligned_storage::type state_mutex_bytes; std::aligned_storage::type state_cv_bytes; @@ -144,6 +143,7 @@ class WriteThread { disable_wal(false), disable_memtable(false), batch_cnt(0), + protection_bytes_per_key(0), pre_release_callback(nullptr), log_used(0), log_ref(0), @@ -165,6 +165,7 @@ class WriteThread { disable_wal(write_options.disableWAL), disable_memtable(_disable_memtable), batch_cnt(_batch_cnt), + protection_bytes_per_key(_batch->GetProtectionBytesPerKey()), pre_release_callback(_pre_release_callback), log_used(0), log_ref(_log_ref), @@ -245,7 +246,7 @@ class WriteThread { std::condition_variable& StateCV() { assert(made_waitable); return *static_cast( - static_cast(&state_cv_bytes)); + static_cast(&state_cv_bytes)); } }; @@ -272,7 +273,7 @@ class WriteThread { // STATE_GROUP_LEADER. If w has been made part of a sequential batch // group and the leader has performed the write, returns STATE_DONE. // If w has been made part of a parallel batch group and is responsible - // for updating the memtable, returns STATE_PARALLEL_FOLLOWER. + // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER. // // The db mutex SHOULD NOT be held when calling this function, because // it will block. @@ -309,8 +310,8 @@ class WriteThread { // the next leader if needed. void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group); - // Causes JoinBatchGroup to return STATE_PARALLEL_FOLLOWER for all of the - // non-leader members of this write batch group. Sets Writer::sequence + // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all of + // the non-leader members of this write batch group. Sets Writer::sequence // before waking them up. 
// // WriteGroup* write_group: Extra state used to coordinate the parallel add diff --git a/db_stress_tool/batched_ops_stress.cc b/db_stress_tool/batched_ops_stress.cc index ea1fab5696d..db81eb2bbbb 100644 --- a/db_stress_tool/batched_ops_stress.cc +++ b/db_stress_tool/batched_ops_stress.cc @@ -31,7 +31,8 @@ class BatchedOpsStressTest : public StressTest { std::string keys[10] = {"9", "8", "7", "6", "5", "4", "3", "2", "1", "0"}; std::string values[10] = {"9", "8", "7", "6", "5", "4", "3", "2", "1", "0"}; Slice value_slices[10]; - WriteBatch batch; + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, 0 /* ts_sz */, + FLAGS_batch_protection_bytes_per_key); Status s; auto cfh = column_families_[rand_column_families[0]]; std::string key_str = Key(rand_keys[0]); @@ -66,7 +67,8 @@ class BatchedOpsStressTest : public StressTest { std::unique_ptr& /* lock */) override { std::string keys[10] = {"9", "7", "5", "3", "1", "8", "6", "4", "2", "0"}; - WriteBatch batch; + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, 0 /* ts_sz */, + FLAGS_batch_protection_bytes_per_key); Status s; auto cfh = column_families_[rand_column_families[0]]; std::string key_str = Key(rand_keys[0]); diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc index f1823302a29..eccb9b554d5 100644 --- a/db_stress_tool/db_stress_common.cc +++ b/db_stress_tool/db_stress_common.cc @@ -16,10 +16,10 @@ #include "util/file_checksum_helper.h" #include "util/xxhash.h" -ROCKSDB_NAMESPACE::DbStressEnvWrapper* db_stress_env = nullptr; +ROCKSDB_NAMESPACE::Env* db_stress_env = nullptr; #ifndef NDEBUG // If non-null, injects read error at a rate specified by the -// read_fault_one_in flag +// read_fault_one_in or write_fault_one_in flag std::shared_ptr fault_fs_guard; #endif // NDEBUG enum ROCKSDB_NAMESPACE::CompressionType compression_type_e = @@ -30,7 +30,7 @@ enum ROCKSDB_NAMESPACE::ChecksumType checksum_type_e = ROCKSDB_NAMESPACE::kCRC32c; enum RepFactory FLAGS_rep_factory = kSkipList; std::vector sum_probs(100001); -int64_t zipf_sum_size = 100000; +constexpr int64_t zipf_sum_size = 100000; namespace ROCKSDB_NAMESPACE { @@ -160,8 +160,10 @@ void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz) { snprintf(buf, 4, "%X", value[i]); tmp.append(buf); } - fprintf(stdout, "[CF %d] %" PRIi64 " == > (%" ROCKSDB_PRIszt ") %s\n", cf, - key, sz, tmp.c_str()); + auto key_str = Key(key); + Slice key_slice = key_str; + fprintf(stdout, "[CF %d] %s (%" PRIi64 ") == > (%" ROCKSDB_PRIszt ") %s\n", + cf, key_slice.ToString(true).c_str(), key, sz, tmp.c_str()); } // Note that if hot_key_alpha != 0, it generates the key based on Zipfian @@ -231,6 +233,15 @@ size_t GenerateValue(uint32_t rand, char* v, size_t max_sz) { return value_sz; // the size of the value set. 
} +std::string NowNanosStr() { + uint64_t t = db_stress_env->NowNanos(); + std::string ret; + PutFixed64(&ret, t); + return ret; +} + +std::string GenerateTimestampForRead() { return NowNanosStr(); } + namespace { class MyXXH64Checksum : public FileChecksumGenerator { diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index 67d4530c5ce..a7476594278 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -132,6 +132,9 @@ DECLARE_int32(set_options_one_in); DECLARE_int32(set_in_place_one_in); DECLARE_int64(cache_size); DECLARE_bool(cache_index_and_filter_blocks); +DECLARE_int32(top_level_index_pinning); +DECLARE_int32(partition_pinning); +DECLARE_int32(unpartitioned_pinning); DECLARE_bool(use_clock_cache); DECLARE_uint64(subcompactions); DECLARE_uint64(periodic_compaction_seconds); @@ -141,6 +144,7 @@ DECLARE_bool(enable_write_thread_adaptive_yield); DECLARE_int32(reopen); DECLARE_double(bloom_bits); DECLARE_bool(use_block_based_filter); +DECLARE_bool(use_ribbon_filter); DECLARE_bool(partition_filters); DECLARE_bool(optimize_filters_for_memory); DECLARE_int32(index_type); @@ -202,6 +206,7 @@ DECLARE_string(bottommost_compression_type); DECLARE_int32(compression_max_dict_bytes); DECLARE_int32(compression_zstd_max_train_bytes); DECLARE_int32(compression_parallel_threads); +DECLARE_uint64(compression_max_dict_buffer_bytes); DECLARE_string(checksum_type); DECLARE_string(hdfs); DECLARE_string(env_uri); @@ -228,6 +233,7 @@ DECLARE_int32(get_property_one_in); DECLARE_string(file_checksum_impl); #ifndef ROCKSDB_LITE +// Options for StackableDB-based BlobDB DECLARE_bool(use_blob_db); DECLARE_uint64(blob_db_min_blob_size); DECLARE_uint64(blob_db_bytes_per_sync); @@ -235,6 +241,16 @@ DECLARE_uint64(blob_db_file_size); DECLARE_bool(blob_db_enable_gc); DECLARE_double(blob_db_gc_cutoff); #endif // !ROCKSDB_LITE + +// Options for integrated BlobDB +DECLARE_bool(allow_setting_blob_options_dynamically); +DECLARE_bool(enable_blob_files); +DECLARE_uint64(min_blob_size); +DECLARE_uint64(blob_file_size); +DECLARE_string(blob_compression_type); +DECLARE_bool(enable_blob_garbage_collection); +DECLARE_double(blob_garbage_collection_age_cutoff); + DECLARE_int32(approximate_size_one_in); DECLARE_bool(sync_fault_injection); @@ -242,13 +258,17 @@ DECLARE_bool(best_efforts_recovery); DECLARE_bool(skip_verifydb); DECLARE_bool(enable_compaction_filter); DECLARE_bool(paranoid_file_checks); +DECLARE_bool(fail_if_options_file_error); +DECLARE_uint64(batch_protection_bytes_per_key); -const long KB = 1024; -const int kRandomValueMaxFactor = 3; -const int kValueMaxLen = 100; +DECLARE_uint64(user_timestamp_size); + +constexpr long KB = 1024; +constexpr int kRandomValueMaxFactor = 3; +constexpr int kValueMaxLen = 100; // wrapped posix or hdfs environment -extern ROCKSDB_NAMESPACE::DbStressEnvWrapper* db_stress_env; +extern ROCKSDB_NAMESPACE::Env* db_stress_env; #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { class FaultInjectionTestFS; @@ -544,6 +564,9 @@ extern StressTest* CreateNonBatchedOpsStressTest(); extern void InitializeHotKeyGenerator(double alpha); extern int64_t GetOneHotKeyID(double rand_seed, int64_t max_key); +extern std::string GenerateTimestampForRead(); +extern std::string NowNanosStr(); + std::shared_ptr GetFileChecksumImpl( const std::string& name); } // namespace ROCKSDB_NAMESPACE diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc index 69411aa2936..b944214ec6d 100644 --- a/db_stress_tool/db_stress_driver.cc +++ 
b/db_stress_tool/db_stress_driver.cc @@ -57,6 +57,7 @@ void ThreadBody(void* v) { } bool RunStressTest(StressTest* stress) { + SystemClock* clock = db_stress_env->GetSystemClock().get(); stress->InitDb(); SharedState shared(db_stress_env, stress); stress->FinishInitDb(&shared); @@ -69,9 +70,9 @@ bool RunStressTest(StressTest* stress) { uint32_t n = shared.GetNumThreads(); - uint64_t now = db_stress_env->NowMicros(); + uint64_t now = clock->NowMicros(); fprintf(stdout, "%s Initializing worker threads\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock->TimeToString(now / 1000000).c_str()); std::vector threads(n); for (uint32_t i = 0; i < n; i++) { threads[i] = new ThreadState(i, &shared); @@ -104,9 +105,9 @@ bool RunStressTest(StressTest* stress) { } } - now = db_stress_env->NowMicros(); + now = clock->NowMicros(); fprintf(stdout, "%s Starting database operations\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock->TimeToString(now / 1000000).c_str()); shared.SetStart(); shared.GetCondVar()->SignalAll(); @@ -114,16 +115,16 @@ bool RunStressTest(StressTest* stress) { shared.GetCondVar()->Wait(); } - now = db_stress_env->NowMicros(); + now = clock->NowMicros(); if (FLAGS_test_batches_snapshots) { fprintf(stdout, "%s Limited verification already done during gets\n", - db_stress_env->TimeToString((uint64_t)now / 1000000).c_str()); + clock->TimeToString((uint64_t)now / 1000000).c_str()); } else if (FLAGS_skip_verifydb) { fprintf(stdout, "%s Verification skipped\n", - db_stress_env->TimeToString((uint64_t)now / 1000000).c_str()); + clock->TimeToString((uint64_t)now / 1000000).c_str()); } else { fprintf(stdout, "%s Starting verification\n", - db_stress_env->TimeToString((uint64_t)now / 1000000).c_str()); + clock->TimeToString((uint64_t)now / 1000000).c_str()); } shared.SetStartVerify(); @@ -142,11 +143,11 @@ bool RunStressTest(StressTest* stress) { delete threads[i]; threads[i] = nullptr; } - now = db_stress_env->NowMicros(); + now = clock->NowMicros(); if (!FLAGS_skip_verifydb && !FLAGS_test_batches_snapshots && !shared.HasVerificationFailedYet()) { fprintf(stdout, "%s Verification successful\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock->TimeToString(now / 1000000).c_str()); } stress->PrintStatistics(); diff --git a/db_stress_tool/db_stress_env_wrapper.h b/db_stress_tool/db_stress_env_wrapper.h index 484071f1067..f517a489b06 100644 --- a/db_stress_tool/db_stress_env_wrapper.h +++ b/db_stress_tool/db_stress_env_wrapper.h @@ -28,7 +28,9 @@ class DbStressEnvWrapper : public EnvWrapper { f.find(".restore") != std::string::npos) { return target()->DeleteFile(f); } - return Status::OK(); + // Rename the file instead of deletion to keep the history, and + // at the same time it is not visible to RocksDB. + return target()->RenameFile(f, f + "_renamed_"); } // If true, all manifest files will not be delted in DeleteFile(). 
diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 7dcdeefce96..df2fc38c23f 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -287,6 +287,24 @@ DEFINE_int64(cache_size, 2LL * KB * KB * KB, DEFINE_bool(cache_index_and_filter_blocks, false, "True if indexes/filters should be cached in block cache."); +DEFINE_int32( + top_level_index_pinning, + static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), + "Type of pinning for top-level indexes into metadata partitions (see " + "`enum PinningTier` in table.h)"); + +DEFINE_int32( + partition_pinning, + static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), + "Type of pinning for metadata partitions (see `enum PinningTier` in " + "table.h)"); + +DEFINE_int32( + unpartitioned_pinning, + static_cast(ROCKSDB_NAMESPACE::PinningTier::kFallback), + "Type of pinning for unpartitioned metadata blocks (see `enum PinningTier` " + "in table.h)"); + DEFINE_bool(use_clock_cache, false, "Replace default LRU block cache with clock cache."); @@ -307,33 +325,68 @@ DEFINE_bool(enable_write_thread_adaptive_yield, true, "Use a yielding spin loop for brief writer thread waits."); #ifndef ROCKSDB_LITE -// BlobDB Options -DEFINE_bool(use_blob_db, false, "Use BlobDB."); +// Options for StackableDB-based BlobDB +DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Use BlobDB."); -DEFINE_uint64(blob_db_min_blob_size, - ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size, - "Smallest blob to store in a file. Blobs smaller than this " - "will be inlined with the key in the LSM tree."); +DEFINE_uint64( + blob_db_min_blob_size, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().min_blob_size, + "[Stacked BlobDB] Smallest blob to store in a file. Blobs " + "smaller than this will be inlined with the key in the LSM tree."); -DEFINE_uint64(blob_db_bytes_per_sync, - ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync, - "Sync blob files once per every N bytes written."); +DEFINE_uint64( + blob_db_bytes_per_sync, + ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().bytes_per_sync, + "[Stacked BlobDB] Sync blob files once per every N bytes written."); DEFINE_uint64(blob_db_file_size, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().blob_file_size, - "Target size of each blob file."); + "[Stacked BlobDB] Target size of each blob file."); DEFINE_bool( blob_db_enable_gc, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().enable_garbage_collection, - "Enable BlobDB garbage collection."); + "[Stacked BlobDB] Enable BlobDB garbage collection."); DEFINE_double( blob_db_gc_cutoff, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff, - "Cutoff ratio for BlobDB garbage collection."); + "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection."); #endif // !ROCKSDB_LITE +// Options for integrated BlobDB +DEFINE_bool(allow_setting_blob_options_dynamically, false, + "[Integrated BlobDB] Allow setting blob options dynamically."); + +DEFINE_bool( + enable_blob_files, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().enable_blob_files, + "[Integrated BlobDB] Enable writing large values to separate blob files."); + +DEFINE_uint64(min_blob_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().min_blob_size, + "[Integrated BlobDB] The size of the smallest value to be stored " + "separately in a blob file."); + +DEFINE_uint64(blob_file_size, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions().blob_file_size, + "[Integrated BlobDB] The size limit for blob files."); + +DEFINE_string(blob_compression_type, "none", 
+ "[Integrated BlobDB] The compression algorithm to use for large " + "values stored in blob files."); + +DEFINE_bool(enable_blob_garbage_collection, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .enable_blob_garbage_collection, + "[Integrated BlobDB] Enable blob garbage collection."); + +DEFINE_double(blob_garbage_collection_age_cutoff, + ROCKSDB_NAMESPACE::AdvancedColumnFamilyOptions() + .blob_garbage_collection_age_cutoff, + "[Integrated BlobDB] The cutoff in terms of blob file age for " + "garbage collection."); + static const bool FLAGS_subcompactions_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range); @@ -357,6 +410,9 @@ DEFINE_bool(use_block_based_filter, false, "use block based filter" "instead of full filter for block based table"); +DEFINE_bool(use_ribbon_filter, false, + "Use Ribbon filter instead of Bloom filter"); + DEFINE_bool(partition_filters, false, "use partitioned filters " "for block-based table"); @@ -417,7 +473,6 @@ DEFINE_int32(kill_random_test, 0, "probability 1/this"); static const bool FLAGS_kill_random_test_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_kill_random_test, &ValidateInt32Positive); -extern int rocksdb_kill_odds; DEFINE_string(kill_exclude_prefixes, "", "If non-empty, kill points with prefix in the list given will be" @@ -605,6 +660,10 @@ DEFINE_int32(compression_zstd_max_train_bytes, 0, DEFINE_int32(compression_parallel_threads, 1, "Number of threads for parallel compression."); +DEFINE_uint64(compression_max_dict_buffer_bytes, 0, + "Buffering limit for SST file data to sample for dictionary " + "compression."); + DEFINE_string(bottommost_compression_type, "disable", "Algorithm to use to compress bottommost level of the database. " "\"disable\" means disabling the feature"); @@ -732,8 +791,28 @@ DEFINE_bool(paranoid_file_checks, true, "After writing every SST file, reopen it and read all the keys " "and validate checksums"); +DEFINE_bool(fail_if_options_file_error, false, + "Fail operations that fail to detect or properly persist options " + "file."); + +DEFINE_uint64(batch_protection_bytes_per_key, 0, + "If nonzero, enables integrity protection in `WriteBatch` at the " + "specified number of bytes per key. Currently the only supported " + "nonzero value is eight."); + DEFINE_string(file_checksum_impl, "none", "Name of an implementation for file_checksum_gen_factory, or " "\"none\" for null."); +DEFINE_int32(write_fault_one_in, 0, + "On non-zero, enables fault injection on write"); + +DEFINE_uint64(user_timestamp_size, 0, + "Number of bytes for a user-defined timestamp. 
Currently, only " + "8-byte is supported"); + +DEFINE_int32(open_metadata_write_fault_one_in, 0, + "On non-zero, enables fault injection on file metadata write " + "during DB reopen."); + #endif // GFLAGS diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h index 86310f82f75..03bc0784c74 100644 --- a/db_stress_tool/db_stress_shared_state.h +++ b/db_stress_tool/db_stress_shared_state.h @@ -29,6 +29,8 @@ DECLARE_bool(test_batches_snapshots); DECLARE_int32(compaction_thread_pool_adjust_interval); DECLARE_int32(continuous_verification_interval); DECLARE_int32(read_fault_one_in); +DECLARE_int32(write_fault_one_in); +DECLARE_int32(open_metadata_write_fault_one_in); namespace ROCKSDB_NAMESPACE { class StressTest; @@ -417,6 +419,8 @@ struct ThreadState { std::string value; // optional state of all keys in the db std::vector* key_vec; + + std::string timestamp; }; std::queue> snapshot_queue; diff --git a/db_stress_tool/db_stress_stat.h b/db_stress_tool/db_stress_stat.h index e8bc0986a67..429cf3b2a47 100644 --- a/db_stress_tool/db_stress_stat.h +++ b/db_stress_tool/db_stress_stat.h @@ -11,9 +11,9 @@ #include "monitoring/histogram.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/snapshot.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" #include "util/gflags_compat.h" #include "util/random.h" @@ -73,7 +73,7 @@ class Stats { seconds_ = 0; num_compact_files_succeed_ = 0; num_compact_files_failed_ = 0; - start_ = Env::Default()->NowMicros(); + start_ = SystemClock::Default()->NowMicros(); last_op_finish_ = start_; finish_ = start_; } @@ -102,13 +102,13 @@ class Stats { } void Stop() { - finish_ = Env::Default()->NowMicros(); + finish_ = SystemClock::Default()->NowMicros(); seconds_ = (finish_ - start_) * 1e-6; } void FinishedSingleOp() { if (FLAGS_histogram) { - auto now = Env::Default()->NowMicros(); + auto now = SystemClock::Default()->NowMicros(); auto micros = now - last_op_finish_; hist_.Add(micros); if (micros > 20000) { diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 38f435b6eeb..6a649eca9b8 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -15,22 +15,48 @@ #include "db_stress_tool/db_stress_table_properties_collector.h" #include "rocksdb/convenience.h" #include "rocksdb/sst_file_manager.h" +#include "rocksdb/types.h" #include "util/cast_util.h" +#include "utilities/backupable/backupable_db_impl.h" #include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { + +namespace { + +std::shared_ptr CreateFilterPolicy() { + if (FLAGS_bloom_bits < 0) { + return BlockBasedTableOptions().filter_policy; + } + const FilterPolicy* new_policy; + if (FLAGS_use_ribbon_filter) { + // Old and new API should be same + if (std::random_device()() & 1) { + new_policy = NewExperimentalRibbonFilterPolicy(FLAGS_bloom_bits); + } else { + new_policy = NewRibbonFilterPolicy(FLAGS_bloom_bits); + } + } else { + if (FLAGS_use_block_based_filter) { + new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, true); + } else { + new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, false); + } + } + return std::shared_ptr(new_policy); +} + +} // namespace + StressTest::StressTest() : cache_(NewCache(FLAGS_cache_size)), compressed_cache_(NewLRUCache(FLAGS_compressed_cache_size)), - filter_policy_(FLAGS_bloom_bits >= 0 - ? FLAGS_use_block_based_filter - ? 
NewBloomFilterPolicy(FLAGS_bloom_bits, true) - : NewBloomFilterPolicy(FLAGS_bloom_bits, false) - : nullptr), + filter_policy_(CreateFilterPolicy()), db_(nullptr), #ifndef ROCKSDB_LITE txn_db_(nullptr), #endif + clock_(db_stress_env->GetSystemClock().get()), new_column_family_name_(1), num_times_reopened_(0), db_preload_finished_(false), @@ -104,6 +130,22 @@ std::shared_ptr StressTest::NewCache(size_t capacity) { } } +std::vector StressTest::GetBlobCompressionTags() { + std::vector compression_tags{"kNoCompression"}; + + if (Snappy_Supported()) { + compression_tags.emplace_back("kSnappyCompression"); + } + if (LZ4_Supported()) { + compression_tags.emplace_back("kLZ4Compression"); + } + if (ZSTD_Supported()) { + compression_tags.emplace_back("kZSTD"); + } + + return compression_tags; +} + bool StressTest::BuildOptionsTable() { if (FLAGS_set_options_one_in <= 0) { return true; @@ -182,6 +224,21 @@ bool StressTest::BuildOptionsTable() { {"max_sequential_skip_in_iterations", {"4", "8", "12"}}, }; + if (FLAGS_allow_setting_blob_options_dynamically) { + options_tbl.emplace("enable_blob_files", + std::vector{"false", "true"}); + options_tbl.emplace("min_blob_size", + std::vector{"0", "8", "16"}); + options_tbl.emplace("blob_file_size", + std::vector{"1M", "16M", "256M", "1G"}); + options_tbl.emplace("blob_compression_type", GetBlobCompressionTags()); + options_tbl.emplace("enable_blob_garbage_collection", + std::vector{"false", "true"}); + options_tbl.emplace( + "blob_garbage_collection_age_cutoff", + std::vector{"0.0", "0.25", "0.5", "0.75", "1.0"}); + } + options_table_ = std::move(options_tbl); for (const auto& iter : options_table_) { @@ -191,9 +248,9 @@ bool StressTest::BuildOptionsTable() { } void StressTest::InitDb() { - uint64_t now = db_stress_env->NowMicros(); + uint64_t now = clock_->NowMicros(); fprintf(stdout, "%s Initializing db_stress\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock_->TimeToString(now / 1000000).c_str()); PrintEnv(); Open(); BuildOptionsTable(); @@ -201,25 +258,28 @@ void StressTest::InitDb() { void StressTest::FinishInitDb(SharedState* shared) { if (FLAGS_read_only) { - uint64_t now = db_stress_env->NowMicros(); + uint64_t now = clock_->NowMicros(); fprintf(stdout, "%s Preloading db with %" PRIu64 " KVs\n", - db_stress_env->TimeToString(now / 1000000).c_str(), FLAGS_max_key); + clock_->TimeToString(now / 1000000).c_str(), FLAGS_max_key); PreloadDbAndReopenAsReadOnly(FLAGS_max_key, shared); } if (FLAGS_enable_compaction_filter) { - reinterpret_cast( - options_.compaction_filter_factory.get()) - ->SetSharedState(shared); + auto* compaction_filter_factory = + reinterpret_cast( + options_.compaction_filter_factory.get()); + assert(compaction_filter_factory); + compaction_filter_factory->SetSharedState(shared); + fprintf(stdout, "Compaction filter factory: %s\n", + compaction_filter_factory->Name()); } } bool StressTest::VerifySecondaries() { #ifndef ROCKSDB_LITE if (FLAGS_test_secondary) { - uint64_t now = db_stress_env->NowMicros(); - fprintf( - stdout, "%s Start to verify secondaries against primary\n", - db_stress_env->TimeToString(static_cast(now) / 1000000).c_str()); + uint64_t now = clock_->NowMicros(); + fprintf(stdout, "%s Start to verify secondaries against primary\n", + clock_->TimeToString(static_cast(now) / 1000000).c_str()); } for (size_t k = 0; k != secondaries_.size(); ++k) { Status s = secondaries_[k]->TryCatchUpWithPrimary(); @@ -261,10 +321,9 @@ bool StressTest::VerifySecondaries() { } } if (FLAGS_test_secondary) { - uint64_t now = 
db_stress_env->NowMicros(); - fprintf( - stdout, "%s Verification of secondaries succeeded\n", - db_stress_env->TimeToString(static_cast(now) / 1000000).c_str()); + uint64_t now = clock_->NowMicros(); + fprintf(stdout, "%s Verification of secondaries succeeded\n", + clock_->TimeToString(static_cast(now) / 1000000).c_str()); } #endif // ROCKSDB_LITE return true; @@ -278,6 +337,11 @@ Status StressTest::AssertSame(DB* db, ColumnFamilyHandle* cf, } ReadOptions ropt; ropt.snapshot = snap_state.snapshot; + Slice ts; + if (!snap_state.timestamp.empty()) { + ts = snap_state.timestamp; + ropt.timestamp = &ts; + } PinnableSlice exp_v(&snap_state.value); exp_v.PinSelf(); PinnableSlice v; @@ -329,9 +393,11 @@ void StressTest::VerificationAbort(SharedState* shared, std::string msg, void StressTest::VerificationAbort(SharedState* shared, std::string msg, int cf, int64_t key) const { + auto key_str = Key(key); + Slice key_slice = key_str; fprintf(stderr, - "Verification failed for column family %d key %" PRIi64 ": %s\n", cf, - key, msg.c_str()); + "Verification failed for column family %d key %s (%" PRIi64 "): %s\n", + cf, key_slice.ToString(true).c_str(), key, msg.c_str()); shared->SetVerificationFailure(); } @@ -381,6 +447,13 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys, } } else { if (!FLAGS_use_txn) { + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = NowNanosStr(); + ts = ts_str; + write_opts.timestamp = &ts; + } s = db_->Put(write_opts, cfh, key, v); } else { #ifndef ROCKSDB_LITE @@ -421,9 +494,9 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys, #endif db_preload_finished_.store(true); - auto now = db_stress_env->NowMicros(); + auto now = clock_->NowMicros(); fprintf(stdout, "%s Reopening database in read-only\n", - db_stress_env->TimeToString(now / 1000000).c_str()); + clock_->TimeToString(now / 1000000).c_str()); // Reopen as read-only, can ignore all options related to updates Open(); } else { @@ -466,7 +539,7 @@ Status StressTest::NewTxn(WriteOptions& write_opts, Transaction** txn) { } static std::atomic txn_id = {0}; TransactionOptions txn_options; - txn_options.lock_timeout = 60000; // 1min + txn_options.lock_timeout = 600000; // 10 min txn_options.deadlock_detect = true; *txn = txn_db_->BeginTransaction(write_opts, txn_options); auto istr = std::to_string(txn_id.fetch_add(1)); @@ -520,6 +593,15 @@ void StressTest::OperateDb(ThreadState* thread) { fault_fs_guard->SetThreadLocalReadErrorContext(thread->shared->GetSeed(), FLAGS_read_fault_one_in); } + if (FLAGS_write_fault_one_in) { + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + std::vector types = {FileType::kTableFile, + FileType::kDescriptorFile, + FileType::kCurrentFile}; + fault_fs_guard->SetRandomWriteError( + thread->shared->GetSeed(), FLAGS_write_fault_one_in, error_msg, types); + } #endif // NDEBUG thread->stats.Start(); for (int open_cnt = 0; open_cnt <= FLAGS_reopen; ++open_cnt) { @@ -613,7 +695,8 @@ void StressTest::OperateDb(ThreadState* thread) { #ifndef ROCKSDB_LITE // Verify GetLiveFiles with a 1 in N chance. - if (thread->rand.OneInOpt(FLAGS_get_live_files_one_in)) { + if (thread->rand.OneInOpt(FLAGS_get_live_files_one_in) && + !FLAGS_write_fault_one_in) { Status status = VerifyGetLiveFiles(); if (!status.ok()) { VerificationAbort(shared, "VerifyGetLiveFiles status not OK", status); @@ -714,6 +797,20 @@ void StressTest::OperateDb(ThreadState* thread) { } } + // Assign timestamps if necessary. 
+ std::string read_ts_str; + std::string write_ts_str; + Slice read_ts; + Slice write_ts; + if (ShouldAcquireMutexOnKey() && FLAGS_user_timestamp_size > 0) { + read_ts_str = GenerateTimestampForRead(); + read_ts = read_ts_str; + read_opts.timestamp = &read_ts; + write_ts_str = NowNanosStr(); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + } + int prob_op = thread->rand.Uniform(100); // Reset this in case we pick something other than a read op. We don't // want to use a stale value when deciding at the beginning of the loop @@ -804,8 +901,16 @@ std::vector StressTest::GetWhiteBoxKeys(ThreadState* thread, std::vector boundaries; for (const LevelMetaData& lmd : cfmd.levels) { for (const SstFileMetaData& sfmd : lmd.files) { - boundaries.push_back(sfmd.smallestkey); - boundaries.push_back(sfmd.largestkey); + // If FLAGS_user_timestamp_size > 0, then both smallestkey and largestkey + // have timestamps. + const auto& skey = sfmd.smallestkey; + const auto& lkey = sfmd.largestkey; + assert(skey.size() >= FLAGS_user_timestamp_size); + assert(lkey.size() >= FLAGS_user_timestamp_size); + boundaries.push_back( + skey.substr(0, skey.size() - FLAGS_user_timestamp_size)); + boundaries.push_back( + lkey.substr(0, lkey.size() - FLAGS_user_timestamp_size)); } } if (boundaries.empty()) { @@ -955,6 +1060,7 @@ Status StressTest::TestIterate(ThreadState* thread, // iterators with the same set-up, and it doesn't hurt to check them // to be equal. ReadOptions cmp_ro; + cmp_ro.timestamp = readoptionscopy.timestamp; cmp_ro.snapshot = snapshot; cmp_ro.total_order_seek = true; ColumnFamilyHandle* cmp_cfh = @@ -1074,21 +1180,25 @@ void StressTest::VerifyIterator(ThreadState* thread, *diverged = true; return; } else if (op == kLastOpSeek && ro.iterate_lower_bound != nullptr && - (options_.comparator->Compare(*ro.iterate_lower_bound, seek_key) >= - 0 || + (options_.comparator->CompareWithoutTimestamp( + *ro.iterate_lower_bound, /*a_has_ts=*/false, seek_key, + /*b_has_ts=*/false) >= 0 || (ro.iterate_upper_bound != nullptr && - options_.comparator->Compare(*ro.iterate_lower_bound, - *ro.iterate_upper_bound) >= 0))) { + options_.comparator->CompareWithoutTimestamp( + *ro.iterate_lower_bound, /*a_has_ts=*/false, + *ro.iterate_upper_bound, /*b_has_ts*/ false) >= 0))) { // Lower bound behavior is not well defined if it is larger than // seek key or upper bound. Disable the check for now. *diverged = true; return; } else if (op == kLastOpSeekForPrev && ro.iterate_upper_bound != nullptr && - (options_.comparator->Compare(*ro.iterate_upper_bound, seek_key) <= - 0 || + (options_.comparator->CompareWithoutTimestamp( + *ro.iterate_upper_bound, /*a_has_ts=*/false, seek_key, + /*b_has_ts=*/false) <= 0 || (ro.iterate_lower_bound != nullptr && - options_.comparator->Compare(*ro.iterate_lower_bound, - *ro.iterate_upper_bound) >= 0))) { + options_.comparator->CompareWithoutTimestamp( + *ro.iterate_lower_bound, /*a_has_ts=*/false, + *ro.iterate_upper_bound, /*b_has_ts=*/false) >= 0))) { // Uppder bound behavior is not well defined if it is smaller than // seek key or lower bound. Disable the check for now. 
*diverged = true; @@ -1157,9 +1267,13 @@ void StressTest::VerifyIterator(ThreadState* thread, if ((iter->Valid() && iter->key() != cmp_iter->key()) || (!iter->Valid() && (ro.iterate_upper_bound == nullptr || - cmp->Compare(total_order_key, *ro.iterate_upper_bound) < 0) && + cmp->CompareWithoutTimestamp(total_order_key, /*a_has_ts=*/false, + *ro.iterate_upper_bound, + /*b_has_ts=*/false) < 0) && (ro.iterate_lower_bound == nullptr || - cmp->Compare(total_order_key, *ro.iterate_lower_bound) > 0))) { + cmp->CompareWithoutTimestamp(total_order_key, /*a_has_ts=*/false, + *ro.iterate_lower_bound, + /*b_has_ts=*/false) > 0))) { fprintf(stderr, "Iterator diverged from control iterator which" " has value %s %s\n", @@ -1245,11 +1359,6 @@ Status StressTest::TestBackupRestore( backup_opts.share_files_with_checksum_naming | BackupableDBOptions::kFlagIncludeFileSize; } - if (thread->rand.OneIn(2)) { - backup_opts.share_files_with_checksum_naming = - backup_opts.share_files_with_checksum_naming | - BackupableDBOptions::kFlagMatchInterimNaming; - } } } BackupEngine* backup_engine = nullptr; @@ -1259,6 +1368,12 @@ Status StressTest::TestBackupRestore( from = "BackupEngine::Open"; } if (s.ok()) { + if (thread->rand.OneIn(2)) { + TEST_FutureSchemaVersion2Options test_opts; + test_opts.crc32c_checksums = thread->rand.OneIn(2) == 0; + test_opts.file_sizes = thread->rand.OneIn(2) == 0; + TEST_EnableWriteFutureSchemaVersion2(backup_engine, test_opts); + } s = backup_engine->CreateNewBackup(db_); if (!s.ok()) { from = "BackupEngine::CreateNewBackup"; @@ -1273,8 +1388,13 @@ Status StressTest::TestBackupRestore( } } std::vector backup_info; + // If inplace_not_restore, we verify the backup by opening it as a + // read-only DB. If !inplace_not_restore, we restore it to a temporary + // directory for verification. + bool inplace_not_restore = thread->rand.OneIn(3); if (s.ok()) { - backup_engine->GetBackupInfo(&backup_info); + backup_engine->GetBackupInfo(&backup_info, + /*include_file_details*/ inplace_not_restore); if (backup_info.empty()) { s = Status::NotFound("no backups found"); from = "BackupEngine::GetBackupInfo"; @@ -1290,8 +1410,8 @@ Status StressTest::TestBackupRestore( } const bool allow_persistent = thread->tid == 0; // not too many bool from_latest = false; - if (s.ok()) { - int count = static_cast(backup_info.size()); + int count = static_cast(backup_info.size()); + if (s.ok() && !inplace_not_restore) { if (count > 1) { s = backup_engine->RestoreDBFromBackup( RestoreOptions(), backup_info[thread->rand.Uniform(count)].backup_id, @@ -1309,7 +1429,9 @@ Status StressTest::TestBackupRestore( } } } - if (s.ok()) { + if (s.ok() && !inplace_not_restore) { + // Purge early if restoring, to ensure the restored directory doesn't + // have some secret dependency on the backup directory. 
uint32_t to_keep = 0; if (allow_persistent) { // allow one thread to keep up to 2 backups @@ -1337,10 +1459,21 @@ Status StressTest::TestBackupRestore( for (auto name : column_family_names_) { cf_descriptors.emplace_back(name, ColumnFamilyOptions(restore_options)); } - s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors, - &restored_cf_handles, &restored_db); - if (!s.ok()) { - from = "DB::Open in backup/restore"; + if (inplace_not_restore) { + BackupInfo& info = backup_info[thread->rand.Uniform(count)]; + restore_options.env = info.env_for_open.get(); + s = DB::OpenForReadOnly(DBOptions(restore_options), info.name_for_open, + cf_descriptors, &restored_cf_handles, + &restored_db); + if (!s.ok()) { + from = "DB::OpenForReadOnly in backup/restore"; + } + } else { + s = DB::Open(DBOptions(restore_options), restore_dir, cf_descriptors, + &restored_cf_handles, &restored_db); + if (!s.ok()) { + from = "DB::Open in backup/restore"; + } } } // Note the column families chosen by `rand_column_families` cannot be @@ -1354,8 +1487,16 @@ Status StressTest::TestBackupRestore( std::string key_str = Key(rand_keys[0]); Slice key = key_str; std::string restored_value; + ReadOptions read_opts; + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + read_opts.timestamp = &ts; + } Status get_status = restored_db->Get( - ReadOptions(), restored_cf_handles[rand_column_families[i]], key, + read_opts, restored_cf_handles[rand_column_families[i]], key, &restored_value); bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]); if (get_status.ok()) { @@ -1373,10 +1514,6 @@ Status StressTest::TestBackupRestore( } } } - if (backup_engine != nullptr) { - delete backup_engine; - backup_engine = nullptr; - } if (restored_db != nullptr) { for (auto* cf_handle : restored_cf_handles) { restored_db->DestroyColumnFamilyHandle(cf_handle); @@ -1384,6 +1521,22 @@ Status StressTest::TestBackupRestore( delete restored_db; restored_db = nullptr; } + if (s.ok() && inplace_not_restore) { + // Purge late if inplace open read-only + uint32_t to_keep = 0; + if (allow_persistent) { + // allow one thread to keep up to 2 backups + to_keep = thread->rand.Uniform(3); + } + s = backup_engine->PurgeOldBackups(to_keep); + if (!s.ok()) { + from = "BackupEngine::PurgeOldBackups"; + } + } + if (backup_engine != nullptr) { + delete backup_engine; + backup_engine = nullptr; + } if (s.ok()) { // Preserve directories on failure, or allowed persistent backup if (!allow_persistent) { @@ -1455,7 +1608,7 @@ Status StressTest::TestCheckpoint(ThreadState* thread, FLAGS_db + "/.checkpoint" + ToString(thread->tid); Options tmp_opts(options_); tmp_opts.listeners.clear(); - tmp_opts.env = db_stress_env->target(); + tmp_opts.env = db_stress_env; DestroyDB(checkpoint_dir, tmp_opts); @@ -1488,11 +1641,11 @@ Status StressTest::TestCheckpoint(ThreadState* thread, } } } + delete checkpoint; + checkpoint = nullptr; std::vector cf_handles; DB* checkpoint_db = nullptr; if (s.ok()) { - delete checkpoint; - checkpoint = nullptr; Options options(options_); options.listeners.clear(); std::vector cf_descs; @@ -1677,7 +1830,7 @@ Status StressTest::TestPauseBackground(ThreadState* thread) { // 1 chance in 625 of pausing full 16s.) 
int pwr2_micros = std::min(thread->rand.Uniform(25), thread->rand.Uniform(25)); - db_stress_env->SleepForMicroseconds(1 << pwr2_micros); + clock_->SleepForMicroseconds(1 << pwr2_micros); return db_->ContinueBackgroundWork(); } @@ -1686,6 +1839,7 @@ void StressTest::TestAcquireSnapshot(ThreadState* thread, const std::string& keystr, uint64_t i) { Slice key = keystr; ColumnFamilyHandle* column_family = column_families_[rand_column_family]; + ReadOptions ropt; #ifndef ROCKSDB_LITE auto db_impl = static_cast_with_check(db_->GetRootDB()); const bool ww_snapshot = thread->rand.OneIn(10); @@ -1695,8 +1849,19 @@ void StressTest::TestAcquireSnapshot(ThreadState* thread, #else const Snapshot* snapshot = db_->GetSnapshot(); #endif // !ROCKSDB_LITE - ReadOptions ropt; ropt.snapshot = snapshot; + + // Ideally, we want snapshot taking and timestamp generation to be atomic + // here, so that the snapshot corresponds to the timestamp. However, it is + // not possible with current GetSnapshot() API. + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + ropt.timestamp = &ts; + } + std::string value_at; // When taking a snapshot, we also read a key from that snapshot. We // will later read the same key before releasing the snapshot and @@ -1718,10 +1883,14 @@ void StressTest::TestAcquireSnapshot(ThreadState* thread, } } - ThreadState::SnapshotState snap_state = { - snapshot, rand_column_family, column_family->GetName(), - keystr, status_at, value_at, - key_vec}; + ThreadState::SnapshotState snap_state = {snapshot, + rand_column_family, + column_family->GetName(), + keystr, + status_at, + value_at, + key_vec, + ts_str}; uint64_t hold_for = FLAGS_snapshot_hold_ops; if (FLAGS_long_running_snapshots) { // Hold 10% of snapshots for 10x more @@ -1826,6 +1995,13 @@ uint32_t StressTest::GetRangeHash(ThreadState* thread, const Snapshot* snapshot, ReadOptions ro; ro.snapshot = snapshot; ro.total_order_seek = true; + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + ro.timestamp = &ts; + } std::unique_ptr it(db_->NewIterator(ro, column_family)); for (it->Seek(start_key); it->Valid() && options_.comparator->Compare(it->key(), end_key) <= 0; @@ -1852,7 +2028,7 @@ void StressTest::PrintEnv() const { fprintf(stdout, "TransactionDB : %s\n", FLAGS_use_txn ? "true" : "false"); #ifndef ROCKSDB_LITE - fprintf(stdout, "BlobDB : %s\n", + fprintf(stdout, "Stacked BlobDB : %s\n", FLAGS_use_blob_db ? 
"true" : "false"); #endif // !ROCKSDB_LITE fprintf(stdout, "Read only mode : %s\n", @@ -1927,13 +2103,16 @@ void StressTest::PrintEnv() const { fprintf(stdout, "Memtablerep : %s\n", memtablerep); - fprintf(stdout, "Test kill odd : %d\n", rocksdb_kill_odds); - if (!rocksdb_kill_exclude_prefixes.empty()) { +#ifndef NDEBUG + KillPoint* kp = KillPoint::GetInstance(); + fprintf(stdout, "Test kill odd : %d\n", kp->rocksdb_kill_odds); + if (!kp->rocksdb_kill_exclude_prefixes.empty()) { fprintf(stdout, "Skipping kill points prefixes:\n"); - for (auto& p : rocksdb_kill_exclude_prefixes) { + for (auto& p : kp->rocksdb_kill_exclude_prefixes) { fprintf(stdout, " %s\n", p.c_str()); } } +#endif fprintf(stdout, "Periodic Compaction Secs : %" PRIu64 "\n", FLAGS_periodic_compaction_seconds); fprintf(stdout, "Compaction TTL : %" PRIu64 "\n", @@ -1947,9 +2126,17 @@ void StressTest::PrintEnv() const { fprintf(stdout, "Use dynamic level : %d\n", static_cast(FLAGS_level_compaction_dynamic_level_bytes)); fprintf(stdout, "Read fault one in : %d\n", FLAGS_read_fault_one_in); + fprintf(stdout, "Write fault one in : %d\n", FLAGS_write_fault_one_in); + fprintf(stdout, "Open metadata write fault one in:\n"); + fprintf(stdout, " %d\n", + FLAGS_open_metadata_write_fault_one_in); fprintf(stdout, "Sync fault injection : %d\n", FLAGS_sync_fault_injection); fprintf(stdout, "Best efforts recovery : %d\n", static_cast(FLAGS_best_efforts_recovery)); + fprintf(stdout, "Fail if OPTIONS file error: %d\n", + static_cast(FLAGS_fail_if_options_file_error)); + fprintf(stdout, "User timestamp size bytes : %d\n", + static_cast(FLAGS_user_timestamp_size)); fprintf(stdout, "------------------------------------------------\n"); } @@ -1964,6 +2151,12 @@ void StressTest::Open() { block_based_options.block_cache = cache_; block_based_options.cache_index_and_filter_blocks = FLAGS_cache_index_and_filter_blocks; + block_based_options.metadata_cache_options.top_level_index_pinning = + static_cast(FLAGS_top_level_index_pinning); + block_based_options.metadata_cache_options.partition_pinning = + static_cast(FLAGS_partition_pinning); + block_based_options.metadata_cache_options.unpartitioned_pinning = + static_cast(FLAGS_unpartitioned_pinning); block_based_options.block_cache_compressed = compressed_cache_; block_based_options.checksum = checksum_type_e; block_based_options.block_size = FLAGS_block_size; @@ -2028,6 +2221,8 @@ void StressTest::Open() { FLAGS_compression_zstd_max_train_bytes; options_.compression_opts.parallel_threads = FLAGS_compression_parallel_threads; + options_.compression_opts.max_dict_buffer_bytes = + FLAGS_compression_max_dict_buffer_bytes; options_.create_if_missing = true; options_.max_manifest_file_size = FLAGS_max_manifest_file_size; options_.inplace_update_support = FLAGS_in_place_update; @@ -2058,6 +2253,18 @@ void StressTest::Open() { FLAGS_level_compaction_dynamic_level_bytes; options_.file_checksum_gen_factory = GetFileChecksumImpl(FLAGS_file_checksum_impl); + options_.track_and_verify_wals_in_manifest = true; + + // Integrated BlobDB + options_.enable_blob_files = FLAGS_enable_blob_files; + options_.min_blob_size = FLAGS_min_blob_size; + options_.blob_file_size = FLAGS_blob_file_size; + options_.blob_compression_type = + StringToCompressionType(FLAGS_blob_compression_type.c_str()); + options_.enable_blob_garbage_collection = + FLAGS_enable_blob_garbage_collection; + options_.blob_garbage_collection_age_cutoff = + FLAGS_blob_garbage_collection_age_cutoff; } else { #ifdef ROCKSDB_LITE fprintf(stderr, "--options_file 
not supported in lite mode\n"); @@ -2146,10 +2353,38 @@ void StressTest::Open() { options_.best_efforts_recovery = FLAGS_best_efforts_recovery; options_.paranoid_file_checks = FLAGS_paranoid_file_checks; + options_.fail_if_options_file_error = FLAGS_fail_if_options_file_error; + + if ((options_.enable_blob_files || options_.enable_blob_garbage_collection || + FLAGS_allow_setting_blob_options_dynamically) && + (FLAGS_use_merge || FLAGS_best_efforts_recovery)) { + fprintf(stderr, + "Integrated BlobDB is currently incompatible with Merge, " + "and best-effort recovery\n"); + exit(1); + } + + if (options_.enable_blob_files) { + fprintf(stdout, + "Integrated BlobDB: blob files enabled, min blob size %" PRIu64 + ", blob file size %" PRIu64 ", blob compression type %s\n", + options_.min_blob_size, options_.blob_file_size, + CompressionTypeToString(options_.blob_compression_type).c_str()); + } + + if (options_.enable_blob_garbage_collection) { + fprintf(stdout, "Integrated BlobDB: blob GC enabled, cutoff %f\n", + options_.blob_garbage_collection_age_cutoff); + } fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); Status s; + + if (FLAGS_user_timestamp_size > 0) { + CheckAndSetOptionsForUserTimestamp(); + } + if (FLAGS_ttl == -1) { std::vector existing_column_families; s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db, @@ -2203,32 +2438,88 @@ void StressTest::Open() { new DbStressListener(FLAGS_db, options_.db_paths, cf_descriptors)); options_.create_missing_column_families = true; if (!FLAGS_use_txn) { +#ifndef NDEBUG + // Determine whether we need to ingest file metadata write failures + // during DB reopen. If it does, enable it. + // Only ingest metadata error if it is reopening, as initial open + // failure doesn't need to be handled. + // TODO cover transaction DB is not covered in this fault test too. 
+ bool ingest_meta_error = + FLAGS_open_metadata_write_fault_one_in && + fault_fs_guard + ->FileExists(FLAGS_db + "/CURRENT", IOOptions(), nullptr) + .ok(); + if (ingest_meta_error) { + fault_fs_guard->EnableMetadataWriteErrorInjection(); + fault_fs_guard->SetRandomMetadataWriteError( + FLAGS_open_metadata_write_fault_one_in); + } + while (true) { +#endif // NDEBUG #ifndef ROCKSDB_LITE - if (FLAGS_use_blob_db) { - blob_db::BlobDBOptions blob_db_options; - blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size; - blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync; - blob_db_options.blob_file_size = FLAGS_blob_db_file_size; - blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc; - blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff; - - blob_db::BlobDB* blob_db = nullptr; - s = blob_db::BlobDB::Open(options_, blob_db_options, FLAGS_db, - cf_descriptors, &column_families_, &blob_db); - if (s.ok()) { - db_ = blob_db; - } - } else + // StackableDB-based BlobDB + if (FLAGS_use_blob_db) { + blob_db::BlobDBOptions blob_db_options; + blob_db_options.min_blob_size = FLAGS_blob_db_min_blob_size; + blob_db_options.bytes_per_sync = FLAGS_blob_db_bytes_per_sync; + blob_db_options.blob_file_size = FLAGS_blob_db_file_size; + blob_db_options.enable_garbage_collection = FLAGS_blob_db_enable_gc; + blob_db_options.garbage_collection_cutoff = FLAGS_blob_db_gc_cutoff; + + blob_db::BlobDB* blob_db = nullptr; + s = blob_db::BlobDB::Open(options_, blob_db_options, FLAGS_db, + cf_descriptors, &column_families_, + &blob_db); + if (s.ok()) { + db_ = blob_db; + } + } else #endif // !ROCKSDB_LITE - { - if (db_preload_finished_.load() && FLAGS_read_only) { - s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db, cf_descriptors, - &column_families_, &db_); - } else { - s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors, - &column_families_, &db_); + { + if (db_preload_finished_.load() && FLAGS_read_only) { + s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db, + cf_descriptors, &column_families_, &db_); + } else { + s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors, + &column_families_, &db_); + } + } + +#ifndef NDEBUG + if (ingest_meta_error) { + fault_fs_guard->DisableMetadataWriteErrorInjection(); + if (s.ok()) { + // Ingested errors might happen in background compactions. We + // wait for all compactions to finish to make sure DB is in + // clean state before executing queries. + s = static_cast_with_check(db_->GetRootDB()) + ->TEST_WaitForCompact(true); + if (!s.ok()) { + delete db_; + } + } + if (!s.ok()) { + // After failure to opening a DB due to IO error, retry should + // successfully open the DB with correct data if no IO error shows + // up. 
+ ingest_meta_error = false; + + Random rand(static_cast(FLAGS_seed)); + if (rand.OneIn(2)) { + fault_fs_guard->DeleteFilesCreatedAfterLastDirSync(IOOptions(), + nullptr); + } + if (rand.OneIn(3)) { + fault_fs_guard->DropUnsyncedFileData(); + } else if (rand.OneIn(2)) { + fault_fs_guard->DropRandomUnsyncedFileData(&rand); + } + continue; + } } + break; } +#endif // NDEBUG } else { #ifndef ROCKSDB_LITE TransactionDBOptions txn_db_options; @@ -2395,11 +2686,77 @@ void StressTest::Reopen(ThreadState* thread) { secondaries_.clear(); num_times_reopened_++; - auto now = db_stress_env->NowMicros(); + auto now = clock_->NowMicros(); fprintf(stdout, "%s Reopening database for the %dth time\n", - db_stress_env->TimeToString(now / 1000000).c_str(), - num_times_reopened_); + clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_); Open(); } + +void StressTest::CheckAndSetOptionsForUserTimestamp() { + assert(FLAGS_user_timestamp_size > 0); + const Comparator* const cmp = test::ComparatorWithU64Ts(); + assert(cmp); + if (FLAGS_user_timestamp_size != cmp->timestamp_size()) { + fprintf(stderr, + "Only -user_timestamp_size=%d is supported in stress test.\n", + static_cast(cmp->timestamp_size())); + exit(1); + } + if (FLAGS_nooverwritepercent > 0) { + fprintf(stderr, + "-nooverwritepercent must be 0 because SingleDelete must be " + "disabled.\n"); + exit(1); + } + if (FLAGS_use_merge || FLAGS_use_full_merge_v1) { + fprintf(stderr, "Merge does not support timestamp yet.\n"); + exit(1); + } + if (FLAGS_delrangepercent > 0) { + fprintf(stderr, "DeleteRange does not support timestamp yet.\n"); + exit(1); + } + if (FLAGS_use_txn) { + fprintf(stderr, "TransactionDB does not support timestamp yet.\n"); + exit(1); + } + if (FLAGS_read_only) { + fprintf(stderr, "When opened as read-only, timestamp not supported.\n"); + exit(1); + } + if (FLAGS_test_secondary || FLAGS_secondary_catch_up_one_in > 0 || + FLAGS_continuous_verification_interval > 0) { + fprintf(stderr, "Secondary instance does not support timestamp.\n"); + exit(1); + } + if (FLAGS_checkpoint_one_in > 0) { + fprintf(stderr, + "-checkpoint_one_in=%d requires " + "DBImplReadOnly, which is not supported with timestamp\n", + FLAGS_checkpoint_one_in); + exit(1); + } +#ifndef ROCKSDB_LITE + if (FLAGS_enable_blob_files || FLAGS_use_blob_db) { + fprintf(stderr, "BlobDB not supported with timestamp.\n"); + exit(1); + } +#endif // !ROCKSDB_LITE + if (FLAGS_enable_compaction_filter) { + fprintf(stderr, "CompactionFilter not supported with timestamp.\n"); + exit(1); + } + if (FLAGS_test_cf_consistency || FLAGS_test_batches_snapshots) { + fprintf(stderr, + "Due to per-key ts-seq ordering constraint, only the (default) " + "non-batched test is supported with timestamp.\n"); + exit(1); + } + if (FLAGS_ingest_external_file_one_in > 0) { + fprintf(stderr, "Bulk loading may not support timestamp yet.\n"); + exit(1); + } + options_.comparator = cmp; +} } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h index 426af3bd082..351dc0137b4 100644 --- a/db_stress_tool/db_stress_test_base.h +++ b/db_stress_tool/db_stress_test_base.h @@ -13,6 +13,7 @@ #include "db_stress_tool/db_stress_shared_state.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; class Transaction; class TransactionDB; @@ -24,6 +25,8 @@ class StressTest { std::shared_ptr NewCache(size_t capacity); + static std::vector GetBlobCompressionTags(); + bool BuildOptionsTable(); void InitDb(); @@ -208,6 +211,8 @@ class 
StressTest { void Reopen(ThreadState* thread); + void CheckAndSetOptionsForUserTimestamp(); + std::shared_ptr cache_; std::shared_ptr compressed_cache_; std::shared_ptr filter_policy_; @@ -216,6 +221,7 @@ class StressTest { TransactionDB* txn_db_; #endif Options options_; + SystemClock* clock_; std::vector column_families_; std::vector column_family_names_; std::atomic new_column_family_name_; diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index 2126a143634..cb489eb1340 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -23,6 +23,7 @@ #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" #include "db_stress_tool/db_stress_driver.h" +#include "rocksdb/convenience.h" #ifndef NDEBUG #include "utilities/fault_injection_fs.h" #endif @@ -34,11 +35,6 @@ static std::shared_ptr env_wrapper_guard; static std::shared_ptr fault_env_guard; } // namespace -static Env* GetCompositeEnv(std::shared_ptr fs) { - static std::shared_ptr composite_env = NewCompositeEnv(fs); - return composite_env.get(); -} - KeyGenContext key_gen_ctx; int db_stress_tool(int argc, char** argv) { @@ -78,39 +74,53 @@ int db_stress_tool(int argc, char** argv) { if (!FLAGS_hdfs.empty()) { raw_env = new ROCKSDB_NAMESPACE::HdfsEnv(FLAGS_hdfs); - } else if (!FLAGS_env_uri.empty()) { - Status s = Env::LoadEnv(FLAGS_env_uri, &raw_env, &env_guard); - if (raw_env == nullptr) { - fprintf(stderr, "No Env registered for URI: %s\n", FLAGS_env_uri.c_str()); - exit(1); - } - } else if (!FLAGS_fs_uri.empty()) { - std::shared_ptr fs; - Status s = FileSystem::Load(FLAGS_fs_uri, &fs); + } else { + Status s = Env::CreateFromUri(ConfigOptions(), FLAGS_env_uri, FLAGS_fs_uri, + &raw_env, &env_guard); if (!s.ok()) { - fprintf(stderr, "Error: %s\n", s.ToString().c_str()); + fprintf(stderr, "Error Creating Env URI: %s: %s\n", FLAGS_env_uri.c_str(), + s.ToString().c_str()); exit(1); } - raw_env = GetCompositeEnv(fs); - } else { - raw_env = Env::Default(); } #ifndef NDEBUG - if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection) { + if (FLAGS_read_fault_one_in || FLAGS_sync_fault_injection || + FLAGS_write_fault_one_in || FLAGS_open_metadata_write_fault_one_in) { FaultInjectionTestFS* fs = new FaultInjectionTestFS(raw_env->GetFileSystem()); fault_fs_guard.reset(fs); - fault_fs_guard->SetFilesystemDirectWritable(true); + if (FLAGS_write_fault_one_in) { + fault_fs_guard->SetFilesystemDirectWritable(false); + } else { + fault_fs_guard->SetFilesystemDirectWritable(true); + } fault_env_guard = std::make_shared(raw_env, fault_fs_guard); raw_env = fault_env_guard.get(); } + if (FLAGS_write_fault_one_in) { + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeFinishBuildTable", + [&](void*) { fault_fs_guard->EnableWriteErrorInjection(); }); + SyncPoint::GetInstance()->EnableProcessing(); + } #endif env_wrapper_guard = std::make_shared(raw_env); db_stress_env = env_wrapper_guard.get(); +#ifndef NDEBUG + if (FLAGS_write_fault_one_in) { + // In the write injection case, we need to use the FS interface and returns + // the IOStatus with different error and flags. Therefore, + // DbStressEnvWrapper cannot be used which will swallow the FS + // implementations. We should directly use the raw_env which is the + // CompositeEnvWrapper of env and fault_fs. 
+ db_stress_env = raw_env; + } +#endif + FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); // The number of background threads should be at least as much the @@ -131,17 +141,22 @@ int db_stress_tool(int argc, char** argv) { "test_batches_snapshots test!\n"); exit(1); } - if (FLAGS_memtable_prefix_bloom_size_ratio > 0.0 && FLAGS_prefix_size < 0) { + if (FLAGS_memtable_prefix_bloom_size_ratio > 0.0 && FLAGS_prefix_size < 0 && + !FLAGS_memtable_whole_key_filtering) { fprintf(stderr, - "Error: please specify positive prefix_size in order to use " - "memtable_prefix_bloom_size_ratio\n"); + "Error: please specify positive prefix_size or enable whole key " + "filtering in order to use memtable_prefix_bloom_size_ratio\n"); exit(1); } if ((FLAGS_readpercent + FLAGS_prefixpercent + FLAGS_writepercent + FLAGS_delpercent + FLAGS_delrangepercent + FLAGS_iterpercent) != 100) { fprintf(stderr, - "Error: Read+Prefix+Write+Delete+DeleteRange+Iterate percents != " - "100!\n"); + "Error: " + "Read(%d)+Prefix(%d)+Write(%d)+Delete(%d)+DeleteRange(%d)" + "+Iterate(%d) percents != " + "100!\n", + FLAGS_readpercent, FLAGS_prefixpercent, FLAGS_writepercent, + FLAGS_delpercent, FLAGS_delrangepercent, FLAGS_iterpercent); exit(1); } if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) { @@ -264,9 +279,19 @@ int db_stress_tool(int argc, char** argv) { "test_batches_snapshots must all be 0 when using compaction filter\n"); exit(1); } + if (FLAGS_batch_protection_bytes_per_key > 0 && + !FLAGS_test_batches_snapshots) { + fprintf(stderr, + "Error: test_batches_snapshots must be enabled when " + "batch_protection_bytes_per_key > 0\n"); + exit(1); + } - rocksdb_kill_odds = FLAGS_kill_random_test; - rocksdb_kill_exclude_prefixes = SplitString(FLAGS_kill_exclude_prefixes); +#ifndef NDEBUG + KillPoint* kp = KillPoint::GetInstance(); + kp->rocksdb_kill_odds = FLAGS_kill_random_test; + kp->rocksdb_kill_exclude_prefixes = SplitString(FLAGS_kill_exclude_prefixes); +#endif unsigned int levels = FLAGS_max_key_len; std::vector weights; diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index 269d0886d3f..a4ca23d87a0 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -22,6 +22,13 @@ class NonBatchedOpsStressTest : public StressTest { void VerifyDb(ThreadState* thread) const override { ReadOptions options(FLAGS_verify_checksum, true); + std::string ts_str; + Slice ts; + if (FLAGS_user_timestamp_size > 0) { + ts_str = GenerateTimestampForRead(); + ts = ts_str; + options.timestamp = &ts; + } auto shared = thread->shared; const int64_t max_key = shared->GetMaxKey(); const int64_t keys_per_thread = max_key / shared->GetNumThreads(); @@ -477,6 +484,8 @@ class NonBatchedOpsStressTest : public StressTest { int64_t max_key = shared->GetMaxKey(); int64_t rand_key = rand_keys[0]; int rand_column_family = rand_column_families[0]; + std::string write_ts_str; + Slice write_ts; while (!shared->AllowsOverwrite(rand_key) && (FLAGS_use_merge || shared->Exists(rand_column_family, rand_key))) { lock.reset(); @@ -484,6 +493,11 @@ class NonBatchedOpsStressTest : public StressTest { rand_column_family = thread->rand.Next() % FLAGS_column_families; lock.reset( new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key))); + if (FLAGS_user_timestamp_size > 0) { + write_ts_str = NowNanosStr(); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + } } std::string key_str = Key(rand_key); @@ -559,6 +573,8 @@ class NonBatchedOpsStressTest : 
public StressTest { // OPERATION delete // If the chosen key does not allow overwrite and it does not exist, // choose another key. + std::string write_ts_str; + Slice write_ts; while (!shared->AllowsOverwrite(rand_key) && !shared->Exists(rand_column_family, rand_key)) { lock.reset(); @@ -566,6 +582,11 @@ class NonBatchedOpsStressTest : public StressTest { rand_column_family = thread->rand.Next() % FLAGS_column_families; lock.reset( new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key))); + if (FLAGS_user_timestamp_size > 0) { + write_ts_str = NowNanosStr(); + write_ts = write_ts_str; + write_opts.timestamp = &write_ts; + } } std::string key_str = Key(rand_key); diff --git a/docs/Gemfile b/docs/Gemfile index de6f39519db..d78e4354d29 100644 --- a/docs/Gemfile +++ b/docs/Gemfile @@ -1,2 +1,4 @@ source 'https://rubygems.org' -gem 'github-pages', '~> 207' +gem 'github-pages', '~> 209' + +gem "webrick", "~> 1.7" diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock index 7702aece13b..5a366b7d448 100644 --- a/docs/Gemfile.lock +++ b/docs/Gemfile.lock @@ -1,7 +1,7 @@ GEM remote: https://rubygems.org/ specs: - activesupport (6.0.3.2) + activesupport (6.0.3.4) concurrent-ruby (~> 1.0, >= 1.0.2) i18n (>= 0.7, < 2) minitest (~> 5.1) @@ -17,37 +17,40 @@ GEM commonmarker (0.17.13) ruby-enum (~> 0.5) concurrent-ruby (1.1.7) - dnsruby (1.61.4) + dnsruby (1.61.5) simpleidn (~> 0.1) - em-websocket (0.5.1) + em-websocket (0.5.2) eventmachine (>= 0.12.9) http_parser.rb (~> 0.6.0) ethon (0.12.0) ffi (>= 1.3.0) eventmachine (1.2.7) execjs (2.7.0) - faraday (1.0.1) + faraday (1.3.0) + faraday-net_http (~> 1.0) multipart-post (>= 1.2, < 3) - ffi (1.13.1) + ruby2_keywords + faraday-net_http (1.0.0) + ffi (1.14.2) forwardable-extended (2.6.0) gemoji (3.0.1) - github-pages (207) + github-pages (209) github-pages-health-check (= 1.16.1) jekyll (= 3.9.0) jekyll-avatar (= 0.7.0) jekyll-coffeescript (= 1.1.1) jekyll-commonmark-ghpages (= 0.1.6) jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.13.0) + jekyll-feed (= 0.15.1) jekyll-gist (= 1.5.0) jekyll-github-metadata (= 2.13.0) - jekyll-mentions (= 1.5.1) + jekyll-mentions (= 1.6.0) jekyll-optional-front-matter (= 0.3.2) jekyll-paginate (= 1.1.0) jekyll-readme-index (= 0.3.0) - jekyll-redirect-from (= 0.15.0) + jekyll-redirect-from (= 0.16.0) jekyll-relative-links (= 0.6.1) - jekyll-remote-theme (= 0.4.1) + jekyll-remote-theme (= 0.4.2) jekyll-sass-converter (= 1.5.2) jekyll-seo-tag (= 2.6.1) jekyll-sitemap (= 1.4.0) @@ -55,7 +58,7 @@ GEM jekyll-theme-architect (= 0.1.1) jekyll-theme-cayman (= 0.1.1) jekyll-theme-dinky (= 0.1.1) - jekyll-theme-hacker (= 0.1.1) + jekyll-theme-hacker (= 0.1.2) jekyll-theme-leap-day (= 0.1.1) jekyll-theme-merlot (= 0.1.1) jekyll-theme-midnight (= 0.1.1) @@ -66,14 +69,14 @@ GEM jekyll-theme-tactile (= 0.1.1) jekyll-theme-time-machine (= 0.1.1) jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.11.1) - kramdown (= 2.3.0) + jemoji (= 0.12.0) + kramdown (= 2.3.1) kramdown-parser-gfm (= 1.1.0) liquid (= 4.0.3) mercenary (~> 0.3) minima (= 2.5.1) nokogiri (>= 1.10.4, < 2.0) - rouge (= 3.19.0) + rouge (= 3.23.0) terminal-table (~> 1.4) github-pages-health-check (1.16.1) addressable (~> 2.3) @@ -81,7 +84,7 @@ GEM octokit (~> 4.0) public_suffix (~> 3.0) typhoeus (~> 1.3) - html-pipeline (2.13.0) + html-pipeline (2.14.0) activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.6.0) @@ -114,14 +117,14 @@ GEM rouge (>= 2.0, < 4.0) jekyll-default-layout (0.1.4) jekyll (~> 3.0) - jekyll-feed (0.13.0) + jekyll-feed (0.15.1) jekyll (>= 3.7, 
< 5.0) jekyll-gist (1.5.0) octokit (~> 4.2) jekyll-github-metadata (2.13.0) jekyll (>= 3.4, < 5.0) octokit (~> 4.0, != 4.4.0) - jekyll-mentions (1.5.1) + jekyll-mentions (1.6.0) html-pipeline (~> 2.3) jekyll (>= 3.7, < 5.0) jekyll-optional-front-matter (0.3.2) @@ -129,14 +132,15 @@ GEM jekyll-paginate (1.1.0) jekyll-readme-index (0.3.0) jekyll (>= 3.0, < 5.0) - jekyll-redirect-from (0.15.0) + jekyll-redirect-from (0.16.0) jekyll (>= 3.3, < 5.0) jekyll-relative-links (0.6.1) jekyll (>= 3.3, < 5.0) - jekyll-remote-theme (0.4.1) + jekyll-remote-theme (0.4.2) addressable (~> 2.0) jekyll (>= 3.5, < 5.0) - rubyzip (>= 1.3.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) jekyll-sass-converter (1.5.2) sass (~> 3.4) jekyll-seo-tag (2.6.1) @@ -153,8 +157,8 @@ GEM jekyll-theme-dinky (0.1.1) jekyll (~> 3.5) jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.1.1) - jekyll (~> 3.5) + jekyll-theme-hacker (0.1.2) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) jekyll-theme-leap-day (0.1.1) jekyll (~> 3.5) @@ -188,41 +192,44 @@ GEM jekyll (>= 3.3, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.11.1) + jemoji (0.12.0) gemoji (~> 3.0) html-pipeline (~> 2.2) jekyll (>= 3.0, < 5.0) - kramdown (2.3.0) + kramdown (2.3.1) rexml kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) liquid (4.0.3) - listen (3.2.1) + listen (3.4.0) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.4.0) + mini_portile2 (2.5.1) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.14.1) + minitest (5.14.3) multipart-post (2.1.1) - nokogiri (1.10.10) - mini_portile2 (~> 2.4.0) - octokit (4.18.0) + nokogiri (1.11.4) + mini_portile2 (~> 2.5.0) + racc (~> 1.4) + octokit (4.20.0) faraday (>= 0.9) sawyer (~> 0.8.0, >= 0.5.3) pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (3.1.1) + racc (1.5.2) rb-fsevent (0.10.4) rb-inotify (0.10.1) ffi (~> 1.0) - rexml (3.2.4) - rouge (3.19.0) + rexml (3.2.5) + rouge (3.23.0) ruby-enum (0.8.0) i18n + ruby2_keywords (0.0.2) rubyzip (2.3.0) safe_yaml (1.0.5) sass (3.7.4) @@ -240,19 +247,21 @@ GEM thread_safe (0.3.6) typhoeus (1.4.0) ethon (>= 0.9.0) - tzinfo (1.2.7) + tzinfo (1.2.9) thread_safe (~> 0.1) unf (0.1.4) unf_ext unf_ext (0.0.7.7) unicode-display_width (1.7.0) - zeitwerk (2.4.0) + webrick (1.7.0) + zeitwerk (2.4.2) PLATFORMS ruby DEPENDENCIES - github-pages (~> 207) + github-pages (~> 209) + webrick (~> 1.7) BUNDLED WITH - 2.1.4 + 2.2.3 diff --git a/docs/_data/authors.yml b/docs/_data/authors.yml index 13225be9dfd..dca958ea244 100644 --- a/docs/_data/authors.yml +++ b/docs/_data/authors.yml @@ -68,3 +68,6 @@ lightmark: fgwu: full_name: Fenggang Wu fbid: 100002297362180 + +ltamasi: + full_name: Levi Tamasi diff --git a/docs/_posts/2021-04-12-universal-improvements.markdown b/docs/_posts/2021-04-12-universal-improvements.markdown new file mode 100644 index 00000000000..fa4e9d463b2 --- /dev/null +++ b/docs/_posts/2021-04-12-universal-improvements.markdown @@ -0,0 +1,46 @@ +--- +title: (Call For Contribution) Make Universal Compaction More Incremental +layout: post +author: sdong +category: blog +--- + +### Motivation + +Universal Compaction is an important compaction style, but few changes were made after we made the structure multi-leveled. Yet the major restriction of always compacting full sorted run is not relaxed. 
Compared to Leveled Compaction, where we usually compact only a few SST files together, in universal compaction we frequently compact GBs of data. This gap causes two issues: 1. it makes it harder to unify universal and leveled compaction; 2. data is periodically fully compacted, and in the meantime space usage is temporarily doubled. To ease the problem, we can relax this restriction and compact more incrementally, as leveled compaction does, bringing universal compaction closer to a unified compaction style. + +We are calling for help with the following improvements. + + +### How Universal Compaction Works + +In universal compaction, whole levels are compacted together to satisfy two conditions (see the [wiki page](https://github.com/facebook/rocksdb/wiki/Universal-Compaction) for more details): + +1. total size / bottommost level size > a threshold, or +2. total number of sorted runs (non-0 levels + L0 files) is within a threshold + +Condition 1 limits the extra space overhead used by dead data, and condition 2 is for read performance. + +If condition 1 is triggered, a full compaction is likely triggered. If condition 2 is triggered, RocksDB compacts some sorted runs to bring the number down. It does so with a simple heuristic meant to reduce the writes needed for this purpose over time: it starts by compacting smaller files, but if the total size to compact is similar to or larger than the size of the next level, it takes that level too, and so on (whether this is the best heuristic is another question, and we have never seriously looked at it). + +### How We Can Improve? + +Let's start with condition 1. Today we do a full compaction here, but that is not necessary. A simple optimization would be to compact just enough files into the bottommost level (Lmax) to satisfy condition 1. This would work if we only need to pick some files from Lmax-1; if it is cheaper over time, we can pick some files from other levels too (a toy sketch of this idea appears at the end of this post). + +Then condition 2. After we address condition 1, there might be holes in some key ranges in older levels. These holes might make it possible to fix the LSM-tree for condition 2 by compacting only some sub-ranges. RocksDB can take individual files into consideration and apply a more sophisticated heuristic. + +This new approach makes universal compaction closer to leveled compaction. The operation for condition 1 is close to how leveled compaction triggers an Lmax-1 to Lmax compaction, and condition 2 can potentially be implemented as something similar to level picking in leveled compaction. In fact, all of these file-picking strategies can coexist in a single compaction style; there is no fundamental conflict preventing that. + +### Limitations + +There are a few limitations: + +* Periodic automatic full compaction is unpleasant in one way but useful in another. Some users may rely on it to reason that everything is periodically collapsed, so dead data is gone and old data is rewritten. We need to make sure periodic compaction continues to work for them. +* The L0 to first non-L0 level compaction is the first time data is partitioned in the LSM-tree, which is what makes incremental compaction by range possible. We might need to do more of these compactions to make incremental compaction possible, which will slightly increase the total amount of compaction. +* Compacting a subset of a level introduces some extra overhead for unaligned files, just as in leveled compaction. Better SST boundary-cutting heuristics can reduce this overhead, but it will still be there. + +But I believe the benefits would outweigh the limitations. Reducing the temporary space doubling and moving towards unified compaction would be important achievements. 
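+To make the first idea in "How We Can Improve?" concrete, here is a rough, hypothetical sketch of picking just enough Lmax-1 files to satisfy condition 1. It is not actual RocksDB code; the types and the function name are illustrative only, and it conservatively assumes the picked bytes simply move into Lmax.
+
+```cpp
+#include <cstdint>
+#include <vector>
+
+// Toy model of an SST file; only its size matters here.
+struct FileMeta {
+  uint64_t size;
+};
+
+// Pick files from Lmax-1 until total_size / lmax_size drops to or below the
+// space-amplification threshold, instead of compacting every sorted run.
+std::vector<const FileMeta*> PickJustEnoughForSpaceAmp(
+    const std::vector<FileMeta>& lmax_minus_1, uint64_t total_size,
+    uint64_t lmax_size, double space_amp_threshold) {
+  std::vector<const FileMeta*> picked;
+  for (const FileMeta& f : lmax_minus_1) {
+    if (static_cast<double>(total_size) <=
+        space_amp_threshold * static_cast<double>(lmax_size)) {
+      break;  // condition 1 satisfied; no need to pull in more files
+    }
+    picked.push_back(&f);
+    // Assume the file's bytes end up in Lmax; any garbage dropped during the
+    // compaction would only improve the ratio further.
+    lmax_size += f.size;
+  }
+  return picked;
+}
+```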
+ +### Interested in Help? + +Compaction is the core of LSM-tree, but its improvements are far overdue. If you are a user of universal compaction and would be able to benefit from those improvements, we will be happy to work with you on speeding up the project and bring them to RocksDB sooner. Feel free to communicate with us in [this issue](https://github.com/facebook/rocksdb/issues/8181). diff --git a/docs/_posts/2021-05-26-integrated-blob-db.markdown b/docs/_posts/2021-05-26-integrated-blob-db.markdown new file mode 100644 index 00000000000..9f3a22fa275 --- /dev/null +++ b/docs/_posts/2021-05-26-integrated-blob-db.markdown @@ -0,0 +1,101 @@ +--- +title: Integrated BlobDB +layout: post +author: ltamasi +category: blog +--- +## Background + +BlobDB is essentially RocksDB for large-value use cases. The basic idea, which was proposed in the [WiscKey paper](https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf), is key-value separation: by storing large values in dedicated blob files and storing only small pointers to them in the LSM tree, we avoid copying the values over and over again during compaction, thus reducing write amplification. Historically, BlobDB supported only FIFO and TTL based use cases that can tolerate some data loss. In addition, it was incompatible with many widely used RocksDB features, and required users to adopt a custom API. In 2020, we decided to rearchitect BlobDB from the ground up, taking the lessons learned from WiscKey and the original BlobDB but also drawing inspiration and incorporating ideas from other similar systems. Our goals were to eliminate the above limitations and to create a new integrated version that enables customers to use the well-known RocksDB API, has feature parity with the core of RocksDB, and offers better performance. This new implementation is now available and provides the following improvements over the original: + +* **API.** In contrast with the legacy BlobDB implementation, which had its own `StackableDB`-based interface (`rocksdb::blob_db::BlobDB`), the new version can be used via the well-known `rocksdb::DB` API, and can be configured simply by using a few column family options. +* **Consistency.** With the integrated BlobDB implementation, RocksDB’s consistency guarantees and various write options (like using the WAL or synchronous writes) now apply to blobs as well. Moreover, the new BlobDB keeps track of blob files in the RocksDB MANIFEST. +* **Write performance.** When using the old BlobDB, blobs are extracted and immediately written to blob files by the BlobDB layer *in the application thread*. This has multiple drawbacks from a performance perspective: first, it requires synchronization; second, it means that expensive operations like compression are performed in the application thread; and finally, it involves flushing the blob file after each blob. The new code takes a completely different approach by *offloading blob file building to RocksDB’s background jobs*, i.e. flushes and compactions. This means that similarly to SSTs, any given blob file is now written by a single background thread, eliminating the need for locking, flushing, or performing compression in the foreground. Note that this approach is also a better fit for network-based file systems where small writes might be expensive and opens up the possibility of file format optimizations that involve buffering (like dictionary compression). +* **Read performance.** The old code relies on each read (i.e. 
`Get`, `MultiGet`, or iterator) taking a snapshot and uses those snapshots when deciding which obsolete blob files can be removed. The new BlobDB improves this by generalizing RocksDB’s Version concept, which historically referred to the set of live SST files at a given point in time, to include the set of live blob files as well. This has performance benefits like [making the read path mostly lock-free by utilizing thread-local storage](https://rocksdb.org/blog/2014/06/27/avoid-expensive-locks-in-get.html). We have also introduced a blob file cache that can be utilized to keep frequently accessed blob files open. +* **Garbage collection.** Key-value separation means that if a key pointing to a blob gets overwritten or deleted, the blob becomes unreferenced garbage. To be able to reclaim this space, BlobDB now has garbage collection capabilities. GC is integrated into the compaction process and works by relocating valid blobs residing in old blob files as they are encountered during compaction. Blob files can be marked obsolete (and eventually deleted in one shot) once they contain nothing but garbage. This is more efficient than the method used by WiscKey, which involves performing a `Get` operation to find out whether a blob is still referenced followed by a `Put` to update the reference, which in turn results in garbage collection competing and potentially conflicting with the application’s writes. +* **Feature parity with the RocksDB core.** The new BlobDB supports way more features than the original and is near feature parity with vanilla RocksDB. In particular, we support all basic read/write APIs (with the exception of `Merge`, which is coming soon), recovery, compression, atomic flush, column families, compaction filters, checkpoints, backup/restore, transactions, per-file checksums, and the SST file manager. In addition, the new BlobDB’s options can be dynamically adjusted using the `SetOptions` interface. + +## API + +The new BlobDB can be configured (on a per-column family basis if needed) simply by using the following options: + +* `enable_blob_files`: set it to `true` to enable key-value separation. +* `min_blob_size`: values at or above this threshold will be written to blob files during flush or compaction. +* `blob_file_size`: the size limit for blob files. +* `blob_compression_type`: the compression type to use for blob files. All blobs in the same file are compressed using the same algorithm. +* `enable_blob_garbage_collection`: set this to `true` to make BlobDB actively relocate valid blobs from the oldest blob files as they are encountered during compaction. +* `blob_garbage_collection_age_cutoff`: the threshold that the GC logic uses to determine which blob files should be considered “old.” For example, the default value of 0.25 signals to RocksDB that blobs residing in the oldest 25% of blob files should be relocated by GC. This parameter can be tuned to adjust the trade-off between write amplification and space amplification. + +The above options are all dynamically adjustable via the `SetOptions` API; changing them will affect subsequent flushes and compactions but not ones that are already in progress. + +In terms of compaction styles, we recommend using leveled compaction with BlobDB. The rationale behind universal compaction in general is to provide lower write amplification at the expense of higher read amplification; however, as we will see later in the Performance section, BlobDB can provide very low write amp and good read performance with leveled compaction. 
Therefore, there is really no reason to take the hit in read performance that comes with universal compaction. + +In addition to the above, consider tuning the following non-BlobDB specific options: + +* `write_buffer_size`: this is the memtable size. You might want to increase it for large-value workloads to ensure that SST and blob files contain a decent number of keys. +* `target_file_size_base`: the target size of SST files. Note that even when using BlobDB, it is important to have an LSM tree with a “nice” shape and multiple levels and files per level to prevent heavy compactions. Since BlobDB extracts and writes large values to blob files, it makes sense to make this parameter significantly smaller than the memtable size. One guideline is to set `blob_file_size` to the same value as `write_buffer_size` (adjusted for compression if needed) and make `target_file_size_base` proportionally smaller based on the ratio of key size to value size. +* `max_bytes_for_level_base`: consider setting this to a multiple (e.g. 8x or 10x) of `target_file_size_base`. + +As mentioned above, the new BlobDB now also supports compaction filters. Key-value separation actually enables an optimization here: if the compaction filter of an application can make a decision about a key-value solely based on the key, it is unnecessary to read the value from the blob file. Applications can take advantage of this optimization by implementing the new `FilterBlobByKey` method of the `CompactionFilter` interface. This method gets called by RocksDB first whenever it encounters a key-value where the value is stored in a blob file. If this method returns a “final” decision like `kKeep`, `kRemove`, `kChangeValue`, or `kRemoveAndSkipUntil`, RocksDB will honor that decision; on the other hand, if the method returns `kUndetermined`, RocksDB will read the blob from the blob file and call `FilterV2` with the value in the usual fashion. + +## Performance + +We tested the performance of the new BlobDB for six different value sizes between 1 KB and 1 MB using a customized version of our [standard benchmark suite](https://github.com/facebook/rocksdb/wiki/Performance-Benchmarks) on a box with an 18-core Skylake DE CPU (running at 1.6 GHz, with hyperthreading enabled), 64 GB RAM, a 512 GB boot SSD, and two 1.88 TB M.2 SSDs in a RAID0 configuration for data. The RocksDB version used was equivalent to 6.18.1, with some benchmarking and statistics related enhancements. Leveled and universal compaction without key-value separation were used as reference points. Note that for simplicity, we use “leveled compaction” and “universal compaction” as shorthand for leveled and universal compaction without key-value separation, respectively, and “BlobDB” for BlobDB with leveled compaction. + +Our benchmarks cycled through six different workloads: two write-only ones (initial load and overwrite), two read/write ones (point lookup/write mix and range scan/write mix), and finally two read-only ones (point lookups and range scans). The first two phases performed a fixed amount of work (see below), while the final four were run for a fixed amount of time, namely 30 minutes each. Each phase other than the first one started with the database state left behind by the previous one. 
Here’s a brief description of the workloads: + +* **Initial load**: this workload has two distinct stages, a single-threaded random write stage during which compactions are disabled (so all data is flushed to L0, where it remains for the rest of the stage), followed by a full manual compaction. The random writes are performed with load-optimized settings, namely using the vector memtable implementation and with concurrent memtable writes and WAL disabled. This stage was used to populate the database with 1 TB worth of raw values, e.g. 2^30 (~1 billion) 1 KB values or 2^20 (~1 million) 1 MB values. +* **Overwrite**: this is a multi-threaded random write workload using the usual skiplist memtable, with compactions, WAL, and concurrent memtable writes enabled. In our tests, 16 writer threads were used. The total number of writes was set to the same number as in the initial load stage and split up evenly between the writer threads. For instance, for the 1 MB value size, we had 2^20 writes divided up between the 16 threads, resulting in each thread performing 2^16 write operations. At the end of this phase, a “wait for compactions” step was added to prevent this workload from exhibiting artificially low write amp or conversely, the next phase showing inflated write amp. +* **Point lookup/write mix**: a single writer thread performing random writes while N (in our case, 16) threads perform random point lookups. WAL is enabled and all writes are synced. +* **Range scan/write mix**: similar to the above, with one writer thread and N reader threads (where N was again set to 16 in our tests). The reader threads perform random range scans, with 10 `Next` calls per `Seek`. Again, WAL is enabled, and sync writes are used. +* **Point lookups (read-only)**: N=16 threads perform random point lookups. +* **Range scans (read-only)**: N=16 threads execute random range scans, with 10 `Next`s per `Seek` like above. + +With that out of the way, let’s see how the new BlobDB performs against traditional leveled and universal compaction. In the next few sections, we’ll be looking at write amplification as well as read and write performance. We’ll also briefly compare the write performance of the new BlobDB with the legacy implementation. + +### Write amplification + +Reducing write amp is the original motivation for key-value separation. Here, we follow RocksDB’s definition of write amplification (as used in compaction statistics and the info log). That is, we define write amp as the total amount of data written by flushes and compactions divided by the amount of data written by flushes, where “data written” includes SST files and blob files as well (if applicable). The following charts show that BlobDB significantly reduces write amplification for all of our (non-read only) workloads. + +For the initial load, where due to the nature of the workload both leveled and universal already have a low write amp factor of 1.6, BlobDB has a write amp close to the theoretical minimum of 1.0, namely in the 1.0..1.02 range, depending on value size. How is this possible? Well, the trick is that when key-value separation is used, the full compaction step only has to sort the keys but not the values. This results in a write amp that is about **36% lower** than the already low write amp you get with either leveled or universal. + +In the case of the overwrite workload, BlobDB had a write amp between 1.4 and 1.7 depending on value size. 
This is around **75-78% lower** than the write amp of leveled compaction (6.1 to 6.8) and **70-77% lower** than universal (5.7 to 6.2); for this workload, there wasn’t a huge difference between the performance of leveled and universal. + +When it comes to the point lookup/write mix workload, BlobDB had a write amp between 1.4 and 1.8. This is **83-88% lower** than the write amp of leveled compaction, which had values between 10.8 and 12.5. Universal fared much better than leveled under this workload, and had write amp in the 2.2..6.6 range; however, BlobDB still provided significant gains for all value sizes we tested: namely, write amp was **18-77% lower** than that of universal, depending on value size. + +As for the range scan/write mix workload, BlobDB again had a write amp between 1.4 and 1.8, while leveled had values between 13.6 and 14.9, and universal was between 2.8 and 5.0. In other words, BlobDB’s write amp was **88-90% lower** than that of leveled, and **46-70% lower** than that of universal. + +![Write amplification](/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +### Write performance + +In terms of write performance, there are other factors to consider besides write amplification. The following charts show some interesting metrics for the two write-only workloads (initial load and overwrite). As discussed earlier, these two workloads perform a fixed amount of work; the two charts in the top row show how long it took BlobDB, leveled, and universal to complete that work. Note that each bar is broken down into two, corresponding to the two stages of each workload (random write and full compaction for initial load, and random write and waiting for compactions for overwrite). + +For initial load, note that the random write stage takes the same amount of time regardless of which algorithm is used. This is not surprising considering the fact that compactions are disabled during this stage and thus RocksDB is simply writing L0 files (and in BlobDB’s case, blob files) as fast as it can. The second stage, on the other hand, is very different: as mentioned above, BlobDB essentially only needs to read, sort, and rewrite the keys during compaction, which can be done much much faster (with 1 MB values, more than a hundred times faster) than doing the same for large key-values. Due to this, initial load completed **2.3x to 4.7x faster** overall when using BlobDB. + +As for the overwrite workload, BlobDB performs much better during both stages. The two charts in the bottom row help explain why. In the case of both leveled and universal compaction, compactions can’t keep up with the write rate, which eventually leads to back pressure in the form of write stalls. As shown in the chart below, both leveled and universal stall between ~40% and ~70% of the time; on the other hand, BlobDB is stall-free except for the largest value size tested (1 MB). This naturally leads to higher throughput, namely **2.1x to 3.5x higher** throughput compared to leveled, and **1.6x to 3.0x higher** throughput compared to universal. The overwrite time chart also shows that the catch-up stage that waits for all compactions to finish is much shorter (and in fact, at larger value sizes, negligible) with BlobDB. 
+
+![Write performance](/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+### Read/write and read-only performance
+
+The charts below show the read performance (in terms of operations per second) of BlobDB versus leveled and universal compaction under the two read/write workloads and the two read-only workloads. BlobDB meets or exceeds the read performance of leveled compaction, except for workloads involving range scans at the two smallest value sizes tested (1 KB and 4 KB). It also provides better (in some cases, much better) read performance than universal across the board. In particular, BlobDB provides up to **1.4x higher** read performance than leveled (for larger values), and up to **5.6x higher** than universal.
+
+![Read-write and read-only performance](/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+### Comparing the two BlobDB implementations
+
+To compare the write performance of the new BlobDB with the legacy implementation, we ran two versions of the first (single-threaded random write) stage of the initial load benchmark using 1 KB values: one with WAL disabled, and one with WAL enabled. The new implementation completed the load **4.6x faster** than the old one without WAL, and **2.3x faster** with WAL.
+
+![Comparing the two BlobDB implementations](/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+## Future work
+
+There are a few remaining features that are not yet supported by the new BlobDB. The most important one is `Merge` (and the related `GetMergeOperands` API); in addition, we don’t currently support the `EventListener` interface, the `GetLiveFilesMetaData` and `GetColumnFamilyMetaData` APIs, secondary instances, and ingestion of blob files. We will continue to work on closing this gap.
+
+We also have further plans when it comes to performance. These include optimizing garbage collection, introducing a dedicated cache for blobs, improving iterator and `MultiGet` performance, and evolving the blob file format, among others.
+
diff --git a/docs/_posts/2021-05-26-online-validation.markdown b/docs/_posts/2021-05-26-online-validation.markdown
new file mode 100644
index 00000000000..33e9dfc151a
--- /dev/null
+++ b/docs/_posts/2021-05-26-online-validation.markdown
@@ -0,0 +1,17 @@
+---
+title: Online Validation
+layout: post
+author: sdong
+category: blog
+---
+To prevent or mitigate data corruption in RocksDB when software or hardware issues happen, we keep adding online consistency checks and improving existing ones.
+
+We improved ColumnFamilyOptions::force_consistency_checks and enabled it by default. The option performs some basic consistency checks on the LSM-tree, e.g., verifying that files within a level do not overlap. The DB is frozen to new writes if a violation is detected. Previously, the feature’s check was too limited and didn’t always freeze the DB in a timely manner. Last year, we made the checking stricter so that it can [catch much more corrupted LSM-tree structures](https://github.com/facebook/rocksdb/pull/6901). We also fixed several issues where a checking failure was swallowed without freezing the DB. After making force_consistency_checks more reliable, we turned it on by default.
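+
+Most of the checks discussed in this post are controlled through regular options. The following is a minimal sketch using the option names mentioned in this post (paranoid_file_checks and allow_data_in_errors are described below); defaults vary across releases, so treat the explicit assignments as illustrative:
+
+```
+#include "rocksdb/options.h"
+
+rocksdb::Options options;
+// Basic LSM-tree structure checks; now enabled by default.
+options.force_consistency_checks = true;
+// Read back and verify every newly generated SST file (extra overhead).
+options.paranoid_file_checks = true;
+// Verify key ordering while writing out flush/compaction output.
+options.check_flush_compaction_key_order = true;
+// Opt in to more detailed corruption reports that may include key data.
+options.allow_data_in_errors = true;
+```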
+
+ColumnFamilyOptions::paranoid_file_checks does some more expensive extra checking when generating a new SST file. Last year, we extended its coverage: after an SST file is generated, it is read back key by key and two things are checked: (1) the keys are in comparator order (this check is also available, and enabled by default, during file writes via ColumnFamilyOptions::check_flush_compaction_key_order); (2) the hash of all the KVs matches the hash calculated while the KVs were added to the file. These checks detect certain corruptions so that we can prevent corrupt files from being applied to the DB. We suggest users turn it on at least in shadow environments, and consider running it in production too if you can afford the overhead.
+
+A recently added feature checks the number of entries added to the memtable while it is being flushed into an SST file. It provides some online coverage for memtable corruption caused by either software bugs or hardware issues. This feature will be released in the coming release (6.21) and will be on by default. In the future, we will check more counters during memtable flushes, e.g. the number of puts or deletes.
+
+We also improved the reporting of online validation errors to improve debuggability. For example, a failure to parse a corrupt key now reports details about the corrupt key. Since we did not want to expose key data in logs, error messages, etc., by default, this reporting is opt-in via DBOptions::allow_data_in_errors.
+
+More online checking features are planned, and some are more sophisticated, including key/value checksums and sample-based query validation.
diff --git a/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown b/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown
new file mode 100644
index 00000000000..422554a30cc
--- /dev/null
+++ b/docs/_posts/2021-05-27-rocksdb-secondary-cache.markdown
@@ -0,0 +1,195 @@
+---
+title: RocksDB Secondary Cache
+layout: post
+author: anand1976
+category: blog
+---
+## Introduction
+
+The RocksDB team is implementing support for a block cache on non-volatile media, such as a local flash device or NVM/SCM. It can be viewed as an extension of RocksDB’s current volatile block cache (LRUCache or ClockCache). The non-volatile block cache acts as a second tier cache that contains blocks evicted from the volatile cache. Those blocks are then promoted to the volatile cache as they become hotter due to access.
+
+This feature is meant for cases where the DB is located on remote storage or cloud storage. The non-volatile cache is officially referred to in RocksDB as the SecondaryCache. By maintaining a SecondaryCache that’s an order of magnitude larger than DRAM, fewer reads would be required from remote storage, thus reducing read latency as well as network bandwidth consumption.
+
+From the user point of view, the local flash cache will support the following requirements -
+
+1. Provide a pointer to a secondary cache when opening a DB
+2. Be able to share the secondary cache across DBs in the same process
+3. Have multiple secondary caches on a host
+4. Support persisting the cache across process restarts and reboots by ensuring repeatability of the cache key
+
+![Architecture](/static/images/rocksdb-secondary-cache/arch_diagram.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"}
+
+## Design
+
+When designing the API for a SecondaryCache, we had a choice between making it visible to the RocksDB code (table reader) or hiding it behind the RocksDB block cache.
There are several advantages of hiding it behind the block cache - + +* Allows flexibility in insertion of blocks into the secondary cache. A block can be inserted on eviction from the RAM tier, or it could be eagerly inserted. +* It makes the rest of the RocksDB code less complex by providing a uniform interface regardless of whether a secondary cache is configured or not +* Makes parallel reads, peeking in the cache for prefetching, failure handling etc. easier +* Makes it easier to extend to compressed data if needed, and allows other persistent media, such as PM, to be added as an additional tier + + +We decided to make the secondary cache transparent to the rest of RocksDB code by hiding it behind the block cache. A key issue that we needed to address was the allocation and ownership of memory of the cached items - insertion into the secondary cache may require that memory be allocated by the same. This means that parts of the cached object that can be transferred to the secondary cache needs to be copied out (referred to as **unpacking**), and on a lookup the data stored in the secondary cache needs to be provided to the object constructor (referred to as **packing**). For RocksDB cached objects such as data blocks, index and filter blocks, and compression dictionaries, unpacking involves copying out the raw uncompressed BlockContents of the block, and packing involves constructing the corresponding block/index/filter/dictionary object using the raw uncompressed data. + +Another alternative we considered was the existing PersistentCache interface. However, we decided to not pursue it and eventually deprecate it for the following reasons - +* It is exposed directly to the table reader code, which makes it more difficult to implement different policies such as inclusive/exclusive cache, as well as extending it to more sophisticated admission control policies +* The interface does not allow for custom memory allocation and object packing/unpacking, so new APIs would have to be defined anyway +* The current PersistentCache implementation is very simple and does not have any admission control policies + +## API + +The interface between RocksDB’s block cache and the secondary cache is designed to allow pluggable implementations. For FB internal usage, we plan to use Cachelib with a wrapper to provide the plug-in implementation and use folly and other fbcode libraries, which cannot be used directly by RocksDB, to efficiently implement the cache operations. The following diagrams show the flow of insertion and lookup of a block. + +![Insert flow](/static/images/rocksdb-secondary-cache/insert_flow.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +![Lookup flow](/static/images/rocksdb-secondary-cache/lookup_flow.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +An item in the secondary cache is referenced by a SecondaryCacheHandle. The handle may not be immediately ready or have a valid value. The caller can call IsReady() to determine if its ready, and can call Wait() in order to block until it becomes ready. The caller must call Value() after it becomes ready to determine if the item was successfully read. Value() must return nullptr on failure. + +``` +class SecondaryCacheHandle { + public: + virtual ~SecondaryCacheHandle() {} + + // Returns whether the handle is ready or not + virtual bool IsReady() = 0; + + // Block until handle becomes ready + virtual void Wait() = 0; + + // Return the value. 
If nullptr, it means the lookup was unsuccessful + virtual void* Value() = 0; + + // Return the size of value + virtual size_t Size() = 0; +}; +``` + +The user of the secondary cache (for example, BlockBasedTableReader indirectly through LRUCache) must implement the callbacks defined in CacheItemHelper, in order to facilitate the unpacking/packing of objects for saving to and restoring from the secondary cache. The CreateCallback must be implemented to construct a cacheable object from the raw data in secondary cache. + +``` + // The SizeCallback takes a void* pointer to the object and returns the size + // of the persistable data. It can be used by the secondary cache to allocate + // memory if needed. + using SizeCallback = size_t (*)(void* obj); + + // The SaveToCallback takes a void* object pointer and saves the persistable + // data into a buffer. The secondary cache may decide to not store it in a + // contiguous buffer, in which case this callback will be called multiple + // times with increasing offset + using SaveToCallback = Status (*)(void* from_obj, size_t from_offset, + size_t length, void* out); + + // A function pointer type for custom destruction of an entry's + // value. The Cache is responsible for copying and reclaiming space + // for the key, but values are managed by the caller. + using DeleterFn = void (*)(const Slice& key, void* value); + + // A struct with pointers to helper functions for spilling items from the + // cache into the secondary cache. May be extended in the future. An + // instance of this struct is expected to outlive the cache. + struct CacheItemHelper { + SizeCallback size_cb; + SaveToCallback saveto_cb; + DeleterFn del_cb; + + CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {} + CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb, + DeleterFn _del_cb) + : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {} + }; + + // The CreateCallback is passed by the block cache user to Lookup(). It + // takes in a buffer from the NVM cache and constructs an object using + // it. The callback doesn't have ownership of the buffer and should + // copy the contents into its own buffer. + // typedef std::function + // CreateCallback; + using CreateCallback = std::function; +``` + +The secondary cache provider must provide a concrete implementation of the SecondaryCache abstract class. + +``` +// SecondaryCache +// +// Cache interface for caching blocks on a secondary tier (which can include +// non-volatile media, or alternate forms of caching such as compressed data) +class SecondaryCache { + public: + virtual ~SecondaryCache() {} + + virtual std::string Name() = 0; + + static const std::string Type() { return "SecondaryCache"; } + + // Insert the given value into this cache. The value is not written + // directly. Rather, the SaveToCallback provided by helper_cb will be + // used to extract the persistable data in value, which will be written + // to this tier. The implementation may or may not write it to cache + // depending on the admission control policy, even if the return status is + // success. + virtual Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) = 0; + + // Lookup the data for the given key in this cache. The create_cb + // will be used to create the object. 
The handle returned may not be + // ready yet, unless wait=true, in which case Lookup() will block until + // the handle is ready + virtual std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool wait) = 0; + + // At the discretion of the implementation, erase the data associated + // with key + virtual void Erase(const Slice& key) = 0; + + // Wait for a collection of handles to become ready. This would be used + // by MultiGet, for example, to read multitple data blocks in parallel + virtual void WaitAll(std::vector handles) = 0; + + virtual std::string GetPrintableOptions() const = 0; +}; +``` + +A SecondaryCache is configured by the user by providing a pointer to it in LRUCacheOptions - +``` +struct LRUCacheOptions { + ... + // A SecondaryCache instance to use as an additional cache tier + std::shared_ptr secondary_cache; + ... +}; +``` + +## Current Status + +The initial RocksDB support for the secondary cache has been merged into the master branch, and will be available in the 6.21 release. This includes providing a way for the user to configure a secondary cache when instantiating RocksDB’s LRU cache (volatile block cache), spilling blocks evicted from the LRU cache to the flash cache, promoting a block read from the SecondaryCache to the LRU cache, update tools such as cache_bench and db_bench to specify a flash cache. The relevant PRs are [#8271](https://github.com/facebook/rocksdb/pull/8271), [#8191](https://github.com/facebook/rocksdb/pull/8191), and [#8312](https://github.com/facebook/rocksdb/pull/8312). + +We prototyped an end-to-end solution, with the above PRs as well as a Cachelib based implementation of the SecondaryCache. We ran a mixgraph benchmark to simulate a realistic read/write workload. The results showed a 15% gain with the local flash cache over no local cache, and a ~25-30% reduction in network reads with a corresponding decrease in cache misses. + +![Throughput](/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +![Hit Rate](/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +## Future Work + +In the short term, we plan to do the following in order to fully integrate the SecondaryCache with RocksDB - + +1. Use DB session ID as the cache key prefix to ensure uniqueness and repeatability +2. Optimize flash cache usage of MultiGet and iterator workloads +3. Stress testing +4. More benchmarking + +Longer term, we plan to deploy this in production at Facebook. + +## Call to Action + +We are hoping for a community contribution of a secondary cache implementation, which would make this feature usable by the broader RocksDB userbase. If you are interested in contributing, please reach out to us in [this issue](https://github.com/facebook/rocksdb/issues/8347). + diff --git a/docs/_posts/2021-05-31-dictionary-compression.markdown b/docs/_posts/2021-05-31-dictionary-compression.markdown new file mode 100644 index 00000000000..9b0f4529344 --- /dev/null +++ b/docs/_posts/2021-05-31-dictionary-compression.markdown @@ -0,0 +1,157 @@ +--- +title: Preset Dictionary Compression +layout: post +author: ajkr +category: blog +--- + +## Summary + +Compression algorithms relying on an adaptive dictionary, such as LZ4, zstd, and zlib, struggle to achieve good compression ratios on small inputs when using the basic compress API. 
+With the basic compress API, the compressor starts with an empty dictionary. +With small inputs, not much content gets added to the dictionary during the compression. +Combined, these factors suggest the dictionary will never have enough contents to achieve great compression ratios. + +RocksDB groups key-value pairs into data blocks before storing them in files. +For use cases that are heavy on random accesses, smaller data block size is sometimes desirable for reducing I/O and CPU spent reading blocks. +However, as explained above, smaller data block size comes with the downside of worse compression ratio when using the basic compress API. + +Fortunately, zstd and other libraries offer advanced compress APIs that preset the dictionary. +A preset dictionary makes it possible for the compressor to start from a useful state instead of from an empty one, making compression immediately effective. + +RocksDB now optionally takes advantage of these dictionary presetting APIs. +The challenges in integrating this feature into the storage engine were more substantial than apparent on the surface. +First, we need to target a preset dictionary to the relevant data. +Second, preset dictionaries need to be trained from data samples, which need to be gathered. +Third, preset dictionaries need to be persisted since they are needed at decompression time. +Fourth, overhead in accessing the preset dictionary must be minimized to prevent regression in critical code paths. +Fifth, we need easy-to-use measurement to evaluate candidate use cases and production impact. + +In production, we have deployed dictionary presetting to save space in multiple RocksDB use cases with data block size 8KB or smaller. +We have measured meaningful benefit to compression ratio in use cases with data block size up to 16KB. +We have also measured a use case that can save both CPU and space by reducing data block size and turning on dictionary presetting at the same time. + +## Feature design +#### Targeting + +Over time we have considered a few possibilities for the scope of a dictionary. + +- Subcompaction +- SST file +- Column family + +The original choice was subcompaction scope. +This enabled an approach with minimal buffering overhead because we could collect samples while generating the first output SST file. +The dictionary could then be trained and applied to subsequent SST files in the same subcompaction. + +However, we found a large use case where the proximity of data in the keyspace was more correlated with its similarity than we had predicted. +In particular, the approach of training a dictionary on an adjacent file yielded substantially worse ratios than training the dictionary on the same file it would be used to compress. +In response to this finding, we changed the preset dictionary scope to per SST file. + +With this change in approach, we had to face the problem we had hoped to avoid: how can we compress all of an SST file's data blocks with the same preset dictionary while that dictionary can only be trained after many data blocks have been sampled? +The solutions we considered both involved a new overhead. +We could read the input more than once and introduce I/O overhead, or we could buffer the uncompressed output file data blocks until a dictionary is trained, introducing memory overhead. +We chose to take the hit on memory overhead. + +Another approach that we considered was associating multiple dictionaries with a column family. 
+For example, in MyRocks there could be a dictionary trained on data from each large table.
+When compressing a data block, we would look at the table to which its data belongs and pick the corresponding dictionary.
+However, this approach would introduce many challenges.
+RocksDB would need to be aware of the key schema to know where the table boundaries are.
+RocksDB would also need to periodically update the dictionaries to account for changes in data patterns.
+It would need somewhere to store dictionaries at column family scope.
+Overall, we decided these challenges made the approach too difficult to pursue.
+
+#### Training
+
+![](/static/images/dictcmp/dictcmp_raw_sampled.png)
+{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} +

+Raw samples mode (`zstd_max_train_bytes == 0`) +

+ +As mentioned earlier, the approach we took is to build the dictionary from buffered uncompressed data blocks. +The first row of data blocks in these diagrams illustrate this buffering. +The second row illustrates training samples selected from the buffered blocks. +In raw samples mode (above), the final dictionary is simply the concatenation of these samples. +Whereas, in zstd training mode (below), these samples will be passed to the trainer to produce the final dictionary. + +![](/static/images/dictcmp/dictcmp_zstd_trained.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} +

+zstd training mode (`zstd_max_train_bytes > 0`) +

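+
+For readers curious what “training” means concretely, the snippet below is a rough illustration of zstd training mode using zstd’s public `ZDICT_trainFromBuffer()` API. This is not RocksDB’s internal code; RocksDB drives the trainer for you, and the buffer layout here is just an assumption for the example.
+
+```
+#include <string>
+#include <vector>
+#include <zdict.h>  // zstd dictionary trainer
+
+// `samples` holds the sampled (uncompressed) data block pieces, back to back.
+// `sample_sizes` holds the size of each piece, in order.
+std::string TrainDictionary(const std::string& samples,
+                            const std::vector<size_t>& sample_sizes,
+                            size_t max_dict_bytes) {
+  std::string dict(max_dict_bytes, '\0');
+  size_t dict_size = ZDICT_trainFromBuffer(
+      &dict[0], dict.size(), samples.data(), sample_sizes.data(),
+      static_cast<unsigned>(sample_sizes.size()));
+  if (ZDICT_isError(dict_size)) {
+    return "";  // fall back to no preset dictionary
+  }
+  dict.resize(dict_size);
+  return dict;
+}
+```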
+ +#### Compression path + +Once the preset dictionary is generated by the above process, we apply it to the buffered data blocks and write them to the output file. +Thereafter, newly generated data blocks are immediately compressed and written out. + +One optimization here is available to zstd v0.7.0+ users. +Instead of deserializing the dictionary on each compress invocation, we can do that work once and reuse it. +A `ZSTD_CDict` holds this digested dictionary state and is passed to the compress API. + +#### Persistence + +When an SST file's data blocks are compressed using a preset dictionary, that dictionary is stored inside the file for later use in decompression. + +![](/static/images/dictcmp/dictcmp_sst_blocks.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} +

+SST file layout with the preset dictionary in its own (uncompressed) block +

+ +#### Decompression path + +To decompress, we need to provide both the data block and the dictionary used to compress it. +Since dictionaries are just blocks in a file, we access them through block cache. +However this additional load on block cache can be problematic. +It can be alleviated by pinning the dictionaries to avoid going through the LRU locks. + +An optimization analogous to the digested dictionary exists for certain zstd users (see User API section for details). +When enabled, the block cache stores the digested dictionary state for decompression (`ZSTD_DDict`) instead of the block contents. +In some cases we have seen decompression CPU decrease overall when enabling dictionary thanks to this optimization. + +#### Measurement + +Typically our first step in evaluating a candidate use case is an offline analysis of the data. +This gives us a quick idea whether presetting dictionary will be beneficial without any code, config, or data changes. +Our `sst_dump` tool reports what size SST files would have been using specified compression libraries and options. +We can select random SST files and compare the size with vs. without dictionary. + +When that goes well, the next step is to see how it works in a live DB, like a production shadow or canary. +There we can observe how it affects application/system metrics. + +Even after dictionary is enabled, there is the question of how much space was finally saved. +We provide a way to A/B test size with vs. without dictionary while running in production. +This feature picks a sample of data blocks to compress in multiple ways -- one of the outputs is stored, while the other outputs are thrown away after counting their size. +Due to API limitations, the stored output always has to be the dictionary-compressed one, so this feature can only be used after enabling dictionary. +The size with and without dictionary are stored in the SST file as table properties. +These properties can be aggregated across all SST files in a DB (and across all DBs in a tier) to learn the final space saving. + +## User API + +RocksDB allows presetting compression dictionary for users of LZ4, zstd, and zlib. +The most advanced capabilities are available to zstd v1.1.4+ users who statically link (see below). +Newer versions of zstd (v1.3.6+) have internal changes to the dictionary trainer and digested dictionary management, which significantly improve memory and CPU efficiency. + +Run-time settings: + +- `CompressionOptions::max_dict_bytes`: Limit on per-SST file dictionary size. Increasing this causes dictionaries to consume more space and memory for the possibility of better data block compression. A typical value we use is 16KB. +- (**zstd only**) `CompressionOptions::zstd_max_train_bytes`: Limit on training data passed to zstd dictionary trainer. Larger values cause the training to consume more CPU (and take longer) while generating more effective dictionaries. The starting point guidance we received from zstd team is to set it to 100x `CompressionOptions::max_dict_bytes`. +- `CompressionOptions::max_dict_buffer_bytes`: Limit on data buffering from which training samples are gathered. By default we buffer up to the target file size per ongoing background job. If this amount of memory is concerning, this option can constrain the buffering with the downside that training samples will cover a smaller portion of the SST file. Work is ongoing to charge this memory usage to block cache so it will not need to be accounted for separately. 
+- `BlockBasedTableOptions::cache_index_and_filter_blocks`: Controls whether metadata blocks including dictionary are accessed through block cache or held in table reader memory (yes, its name is outdated). +- `BlockBasedTableOptions::metadata_cache_options`: Controls what metadata blocks are pinned in block cache. Pinning avoids LRU contention at the risk of cold blocks holding memory. +- `ColumnFamilyOptions::sample_for_compression`: Controls frequency of measuring extra compressions on data blocks using various libraries with default settings (i.e., without preset dictionary). + +Compile-time setting: + +- (**zstd only**) `EXTRA_CXXFLAGS=-DZSTD_STATIC_LINKING_ONLY`: Hold digested dictionaries in block cache to save repetitive deserialization overhead. This saves a lot of CPU for read-heavy workloads. This compiler flag is necessary because one of the digested dictionary APIs we use is marked as experimental. We still use it in production, however. + +Function: + +- `DB::GetPropertiesOfAllTables()`: The properties `kSlowCompressionEstimatedDataSize` and `kFastCompressionEstimatedDataSize` estimate what the data block size (`kDataSize`) would have been if the corresponding compression library had been used. These properties are only present when `ColumnFamilyOptions::sample_for_compression` causes one or more samples to be measured, and they become more accurate with higher sampling frequency. + +Tool: + +- `sst_dump --command=recompress`: Offline analysis tool that reports what the SST file size would have been using the specified compression library and options. diff --git a/docs/static/images/dictcmp/dictcmp_raw_sampled.png b/docs/static/images/dictcmp/dictcmp_raw_sampled.png new file mode 100644 index 00000000000..2eb6463c248 Binary files /dev/null and b/docs/static/images/dictcmp/dictcmp_raw_sampled.png differ diff --git a/docs/static/images/dictcmp/dictcmp_sst_blocks.png b/docs/static/images/dictcmp/dictcmp_sst_blocks.png new file mode 100644 index 00000000000..551860b2e9b Binary files /dev/null and b/docs/static/images/dictcmp/dictcmp_sst_blocks.png differ diff --git a/docs/static/images/dictcmp/dictcmp_zstd_trained.png b/docs/static/images/dictcmp/dictcmp_zstd_trained.png new file mode 100644 index 00000000000..966c7fe0f6c Binary files /dev/null and b/docs/static/images/dictcmp/dictcmp_zstd_trained.png differ diff --git a/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png new file mode 100644 index 00000000000..7215390cb5f Binary files /dev/null and b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Legacy_Vs_Integrated.png differ diff --git a/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png new file mode 100644 index 00000000000..f412ee60f09 Binary files /dev/null and b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_RW_RO_Perf.png differ diff --git a/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png new file mode 100644 index 00000000000..19f40b035ae Binary files /dev/null and b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Amp.png differ diff --git a/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png new file mode 100644 index 00000000000..a1d43da0c24 Binary 
files /dev/null and b/docs/static/images/integrated-blob-db/BlobDB_Benchmarks_Write_Perf.png differ diff --git a/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png b/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png new file mode 100644 index 00000000000..10fa7372825 Binary files /dev/null and b/docs/static/images/rocksdb-secondary-cache/Mixgraph_hit_rate.png differ diff --git a/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png b/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png new file mode 100644 index 00000000000..df2e333f9cd Binary files /dev/null and b/docs/static/images/rocksdb-secondary-cache/Mixgraph_throughput.png differ diff --git a/docs/static/images/rocksdb-secondary-cache/arch_diagram.png b/docs/static/images/rocksdb-secondary-cache/arch_diagram.png new file mode 100644 index 00000000000..696a376ed8a Binary files /dev/null and b/docs/static/images/rocksdb-secondary-cache/arch_diagram.png differ diff --git a/docs/static/images/rocksdb-secondary-cache/insert_flow.png b/docs/static/images/rocksdb-secondary-cache/insert_flow.png new file mode 100644 index 00000000000..f02e7e4c509 Binary files /dev/null and b/docs/static/images/rocksdb-secondary-cache/insert_flow.png differ diff --git a/docs/static/images/rocksdb-secondary-cache/lookup_flow.png b/docs/static/images/rocksdb-secondary-cache/lookup_flow.png new file mode 100644 index 00000000000..2b3c70edb49 Binary files /dev/null and b/docs/static/images/rocksdb-secondary-cache/lookup_flow.png differ diff --git a/env/composite_env.cc b/env/composite_env.cc new file mode 100644 index 00000000000..0d70855af51 --- /dev/null +++ b/env/composite_env.cc @@ -0,0 +1,383 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "env/composite_env_wrapper.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +// The CompositeEnvWrapper class provides an interface that is compatible +// with the old monolithic Env API, and an implementation that wraps around +// the new Env that provides threading and other OS related functionality, and +// the new FileSystem API that provides storage functionality. By +// providing the old Env interface, it allows the rest of RocksDB code to +// be agnostic of whether the underlying Env implementation is a monolithic +// Env or an Env + FileSystem. In the former case, the user will specify +// Options::env only, whereas in the latter case, the user will specify +// Options::env and Options::file_system. 
+ +class CompositeSequentialFileWrapper : public SequentialFile { + public: + explicit CompositeSequentialFileWrapper( + std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Read(size_t n, Slice* result, char* scratch) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(n, io_opts, result, scratch, &dbg); + } + Status Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedRead(offset, n, io_opts, result, scratch, &dbg); + } + + private: + std::unique_ptr target_; +}; + +class CompositeRandomAccessFileWrapper : public RandomAccessFile { + public: + explicit CompositeRandomAccessFileWrapper( + std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(offset, n, io_opts, result, scratch, &dbg); + } + Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { + IOOptions io_opts; + IODebugContext dbg; + std::vector fs_reqs; + Status status; + + fs_reqs.resize(num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + fs_reqs[i].offset = reqs[i].offset; + fs_reqs[i].len = reqs[i].len; + fs_reqs[i].scratch = reqs[i].scratch; + fs_reqs[i].status = IOStatus::OK(); + } + status = target_->MultiRead(fs_reqs.data(), num_reqs, io_opts, &dbg); + for (size_t i = 0; i < num_reqs; ++i) { + reqs[i].result = fs_reqs[i].result; + reqs[i].status = fs_reqs[i].status; + } + return status; + } + Status Prefetch(uint64_t offset, size_t n) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Prefetch(offset, n, io_opts, &dbg); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { + target_->Hint((FSRandomAccessFile::AccessPattern)pattern); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + private: + std::unique_ptr target_; +}; + +class CompositeWritableFileWrapper : public WritableFile { + public: + explicit CompositeWritableFileWrapper(std::unique_ptr& t) + : target_(std::move(t)) {} + + Status Append(const Slice& data) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Append(data, io_opts, &dbg); + } + Status Append(const Slice& data, + const DataVerificationInfo& verification_info) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Append(data, io_opts, verification_info, &dbg); + } + Status PositionedAppend(const Slice& data, uint64_t offset) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->PositionedAppend(data, offset, io_opts, &dbg); + } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& verification_info) override { + IOOptions io_opts; + IODebugContext dbg; 
+ return target_->PositionedAppend(data, offset, io_opts, verification_info, + &dbg); + } + Status Truncate(uint64_t size) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Truncate(size, io_opts, &dbg); + } + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Close(io_opts, &dbg); + } + Status Flush() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Flush(io_opts, &dbg); + } + Status Sync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Sync(io_opts, &dbg); + } + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->GetFileSize(io_opts, &dbg); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->RangeSync(offset, nbytes, io_opts, &dbg); + } + + void PrepareWrite(size_t offset, size_t len) override { + IOOptions io_opts; + IODebugContext dbg; + target_->PrepareWrite(offset, len, io_opts, &dbg); + } + + Status Allocate(uint64_t offset, uint64_t len) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Allocate(offset, len, io_opts, &dbg); + } + + std::unique_ptr* target() { return &target_; } + + private: + std::unique_ptr target_; +}; + +class CompositeRandomRWFileWrapper : public RandomRWFile { + public: + explicit CompositeRandomRWFileWrapper(std::unique_ptr& target) + : target_(std::move(target)) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status Write(uint64_t offset, const Slice& data) override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Write(offset, data, io_opts, &dbg); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Read(offset, n, io_opts, result, scratch, &dbg); + } + Status Flush() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Flush(io_opts, &dbg); + } + Status Sync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Sync(io_opts, &dbg); + } + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + Status Close() override { + IOOptions io_opts; + IODebugContext dbg; + 
return target_->Close(io_opts, &dbg); + } + + private: + std::unique_ptr target_; +}; + +class CompositeDirectoryWrapper : public Directory { + public: + explicit CompositeDirectoryWrapper(std::unique_ptr& target) + : target_(std::move(target)) {} + + Status Fsync() override { + IOOptions io_opts; + IODebugContext dbg; + return target_->Fsync(io_opts, &dbg); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr target_; +}; +} // namespace + +Status CompositeEnv::NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewSequentialFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeSequentialFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewRandomAccessFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewRandomAccessFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeRandomAccessFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewWritableFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = file_system_->NewWritableFile(f, FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + IODebugContext dbg; + Status status; + std::unique_ptr file; + status = file_system_->ReopenWritableFile(fname, FileOptions(options), &file, + &dbg); + if (status.ok()) { + result->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* r, + const EnvOptions& options) { + IODebugContext dbg; + Status status; + std::unique_ptr file; + status = file_system_->ReuseWritableFile(fname, old_fname, + FileOptions(options), &file, &dbg); + if (status.ok()) { + r->reset(new CompositeWritableFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + IODebugContext dbg; + std::unique_ptr file; + Status status; + status = + file_system_->NewRandomRWFile(fname, FileOptions(options), &file, &dbg); + if (status.ok()) { + result->reset(new CompositeRandomRWFileWrapper(file)); + } + return status; +} + +Status CompositeEnv::NewDirectory(const std::string& name, + std::unique_ptr* result) { + IOOptions io_opts; + IODebugContext dbg; + std::unique_ptr dir; + Status status; + status = file_system_->NewDirectory(name, io_opts, &dir, &dbg); + if (status.ok()) { + result->reset(new CompositeDirectoryWrapper(dir)); + } + return status; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/env/composite_env_wrapper.h b/env/composite_env_wrapper.h index 7a0da5c3e26..c4df652f921 100644 --- a/env/composite_env_wrapper.h +++ b/env/composite_env_wrapper.h @@ -7,280 +7,24 @@ #include "rocksdb/env.h" #include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" -namespace ROCKSDB_NAMESPACE { - -// The CompositeEnvWrapper class provides an interface that is 
compatible -// with the old monolithic Env API, and an implementation that wraps around -// the new Env that provides threading and other OS related functionality, and -// the new FileSystem API that provides storage functionality. By -// providing the old Env interface, it allows the rest of RocksDB code to -// be agnostic of whether the underlying Env implementation is a monolithic -// Env or an Env + FileSystem. In the former case, the user will specify -// Options::env only, whereas in the latter case, the user will specify -// Options::env and Options::file_system. - -class CompositeSequentialFileWrapper : public SequentialFile { - public: - explicit CompositeSequentialFileWrapper( - std::unique_ptr& target) - : target_(std::move(target)) {} - - Status Read(size_t n, Slice* result, char* scratch) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Read(n, io_opts, result, scratch, &dbg); - } - Status Skip(uint64_t n) override { return target_->Skip(n); } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - Status InvalidateCache(size_t offset, size_t length) override { - return target_->InvalidateCache(offset, length); - } - Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->PositionedRead(offset, n, io_opts, result, scratch, &dbg); - } - - private: - std::unique_ptr target_; -}; - -class CompositeRandomAccessFileWrapper : public RandomAccessFile { - public: - explicit CompositeRandomAccessFileWrapper( - std::unique_ptr& target) - : target_(std::move(target)) {} - - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Read(offset, n, io_opts, result, scratch, &dbg); - } - Status MultiRead(ReadRequest* reqs, size_t num_reqs) override { - IOOptions io_opts; - IODebugContext dbg; - std::vector fs_reqs; - Status status; - - fs_reqs.resize(num_reqs); - for (size_t i = 0; i < num_reqs; ++i) { - fs_reqs[i].offset = reqs[i].offset; - fs_reqs[i].len = reqs[i].len; - fs_reqs[i].scratch = reqs[i].scratch; - fs_reqs[i].status = IOStatus::OK(); - } - status = target_->MultiRead(fs_reqs.data(), num_reqs, io_opts, &dbg); - for (size_t i = 0; i < num_reqs; ++i) { - reqs[i].result = fs_reqs[i].result; - reqs[i].status = fs_reqs[i].status; - } - return status; - } - Status Prefetch(uint64_t offset, size_t n) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Prefetch(offset, n, io_opts, &dbg); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - }; - void Hint(AccessPattern pattern) override { - target_->Hint((FSRandomAccessFile::AccessPattern)pattern); - } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - Status InvalidateCache(size_t offset, size_t length) override { - return target_->InvalidateCache(offset, length); - } - - private: - std::unique_ptr target_; -}; - -class CompositeWritableFileWrapper : public WritableFile { - public: - explicit CompositeWritableFileWrapper(std::unique_ptr& t) - : target_(std::move(t)) {} - - Status Append(const Slice& data) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Append(data, 
io_opts, &dbg); - } - Status PositionedAppend(const Slice& data, uint64_t offset) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->PositionedAppend(data, offset, io_opts, &dbg); - } - Status Truncate(uint64_t size) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Truncate(size, io_opts, &dbg); - } - Status Close() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Close(io_opts, &dbg); - } - Status Flush() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Flush(io_opts, &dbg); - } - Status Sync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Sync(io_opts, &dbg); - } - Status Fsync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Fsync(io_opts, &dbg); - } - bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } - - bool use_direct_io() const override { return target_->use_direct_io(); } - - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - - void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { - target_->SetWriteLifeTimeHint(hint); - } - - Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { - return target_->GetWriteLifeTimeHint(); - } - - uint64_t GetFileSize() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->GetFileSize(io_opts, &dbg); - } - - void SetPreallocationBlockSize(size_t size) override { - target_->SetPreallocationBlockSize(size); - } - - void GetPreallocationStatus(size_t* block_size, - size_t* last_allocated_block) override { - target_->GetPreallocationStatus(block_size, last_allocated_block); - } - - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - } - - Status InvalidateCache(size_t offset, size_t length) override { - return target_->InvalidateCache(offset, length); - } - - Status RangeSync(uint64_t offset, uint64_t nbytes) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->RangeSync(offset, nbytes, io_opts, &dbg); - } - - void PrepareWrite(size_t offset, size_t len) override { - IOOptions io_opts; - IODebugContext dbg; - target_->PrepareWrite(offset, len, io_opts, &dbg); - } - - Status Allocate(uint64_t offset, uint64_t len) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Allocate(offset, len, io_opts, &dbg); - } - - std::unique_ptr* target() { return &target_; } - - private: - std::unique_ptr target_; -}; - -class CompositeRandomRWFileWrapper : public RandomRWFile { - public: - explicit CompositeRandomRWFileWrapper(std::unique_ptr& target) - : target_(std::move(target)) {} - - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - Status Write(uint64_t offset, const Slice& data) override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Write(offset, data, io_opts, &dbg); - } - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Read(offset, n, io_opts, result, scratch, &dbg); - } - Status Flush() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Flush(io_opts, &dbg); - } - Status Sync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Sync(io_opts, &dbg); - } - Status Fsync() override { - IOOptions io_opts; - IODebugContext dbg; - 
return target_->Fsync(io_opts, &dbg); - } - Status Close() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Close(io_opts, &dbg); - } - - private: - std::unique_ptr target_; -}; - -class CompositeDirectoryWrapper : public Directory { - public: - explicit CompositeDirectoryWrapper(std::unique_ptr& target) - : target_(std::move(target)) {} - - Status Fsync() override { - IOOptions io_opts; - IODebugContext dbg; - return target_->Fsync(io_opts, &dbg); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - } +#ifdef _WIN32 +// Windows API macro interference +#undef DeleteFile +#undef GetCurrentTime +#undef LoadLibrary +#endif - private: - std::unique_ptr target_; -}; +namespace ROCKSDB_NAMESPACE { -class CompositeEnvWrapper : public Env { +class CompositeEnv : public Env { public: // Initialize a CompositeEnvWrapper that delegates all thread/time related // calls to env, and all file operations to fs - explicit CompositeEnvWrapper(Env* env, std::shared_ptr fs) - : Env(fs), env_target_(env) {} - ~CompositeEnvWrapper() {} - - // Return the target to which this Env forwards all calls - Env* env_target() const { return env_target_; } + explicit CompositeEnv(const std::shared_ptr& fs, + const std::shared_ptr& clock) + : Env(fs, clock) {} Status RegisterDbPaths(const std::vector& paths) override { return file_system_->RegisterDbPaths(paths); @@ -292,99 +36,37 @@ class CompositeEnvWrapper : public Env { // The following text is boilerplate that forwards all methods to target() Status NewSequentialFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = - file_system_->NewSequentialFile(f, FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeSequentialFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewRandomAccessFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = - file_system_->NewRandomAccessFile(f, FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeRandomAccessFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewWritableFile(const std::string& f, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = - file_system_->NewWritableFile(f, FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeWritableFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status ReopenWritableFile(const std::string& fname, std::unique_ptr* result, - const EnvOptions& options) override { - IODebugContext dbg; - Status status; - std::unique_ptr file; - status = file_system_->ReopenWritableFile(fname, FileOptions(options), - &file, &dbg); - if (status.ok()) { - result->reset(new CompositeWritableFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, std::unique_ptr* r, - const EnvOptions& options) override { - IODebugContext dbg; - Status status; - std::unique_ptr file; - status = file_system_->ReuseWritableFile(fname, old_fname, - FileOptions(options), &file, &dbg); - if (status.ok()) { - r->reset(new CompositeWritableFileWrapper(file)); - } - 
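// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: how the CompositeEnvWrapper
// constructors declared in this (internal) header are typically combined.
// Only Env, FileSystem, SystemClock and CompositeEnvWrapper as declared above
// are assumed; `custom_fs` and `custom_clock` stand in for user-supplied
// implementations.
#include <memory>

#include "env/composite_env_wrapper.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/system_clock.h"

namespace {
// Builds an Env that keeps Env::Default() for thread scheduling, but routes
// all file I/O through `custom_fs` and all time queries through
// `custom_clock`. The returned Env would then be set on Options::env.
std::unique_ptr<ROCKSDB_NAMESPACE::Env> MakeCompositeEnv(
    const std::shared_ptr<ROCKSDB_NAMESPACE::FileSystem>& custom_fs,
    const std::shared_ptr<ROCKSDB_NAMESPACE::SystemClock>& custom_clock) {
  return std::unique_ptr<ROCKSDB_NAMESPACE::Env>(
      new ROCKSDB_NAMESPACE::CompositeEnvWrapper(
          ROCKSDB_NAMESPACE::Env::Default(), custom_fs, custom_clock));
}
}  // namespace
// ---------------------------------------------------------------------------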
return status; - } + const EnvOptions& options) override; + Status NewRandomRWFile(const std::string& fname, std::unique_ptr* result, - const EnvOptions& options) override { - IODebugContext dbg; - std::unique_ptr file; - Status status; - status = - file_system_->NewRandomRWFile(fname, FileOptions(options), &file, &dbg); - if (status.ok()) { - result->reset(new CompositeRandomRWFileWrapper(file)); - } - return status; - } + const EnvOptions& options) override; + Status NewMemoryMappedFileBuffer( const std::string& fname, std::unique_ptr* result) override { return file_system_->NewMemoryMappedFileBuffer(fname, result); } + Status NewDirectory(const std::string& name, - std::unique_ptr* result) override { - IOOptions io_opts; - IODebugContext dbg; - std::unique_ptr dir; - Status status; - status = file_system_->NewDirectory(name, io_opts, &dir, &dbg); - if (status.ok()) { - result->reset(new CompositeDirectoryWrapper(dir)); - } - return status; - } + std::unique_ptr* result) override; + Status FileExists(const std::string& f) override { IOOptions io_opts; IODebugContext dbg; @@ -498,109 +180,32 @@ class CompositeEnvWrapper : public Env { return file_system_->IsDirectory(path, io_opts, is_dir, &dbg); } -#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) - Status LoadLibrary(const std::string& lib_name, - const std::string& search_path, - std::shared_ptr* result) override { - return env_target_->LoadLibrary(lib_name, search_path, result); - } -#endif - - void Schedule(void (*f)(void* arg), void* a, Priority pri, - void* tag = nullptr, void (*u)(void* arg) = nullptr) override { - return env_target_->Schedule(f, a, pri, tag, u); - } - - int UnSchedule(void* tag, Priority pri) override { - return env_target_->UnSchedule(tag, pri); - } - - void StartThread(void (*f)(void*), void* a) override { - return env_target_->StartThread(f, a); - } - void WaitForJoin() override { return env_target_->WaitForJoin(); } - unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { - return env_target_->GetThreadPoolQueueLen(pri); - } Status GetTestDirectory(std::string* path) override { IOOptions io_opts; IODebugContext dbg; return file_system_->GetTestDirectory(io_opts, path, &dbg); } - uint64_t NowMicros() override { return env_target_->NowMicros(); } - uint64_t NowNanos() override { return env_target_->NowNanos(); } - uint64_t NowCPUNanos() override { return env_target_->NowCPUNanos(); } - - void SleepForMicroseconds(int micros) override { - env_target_->SleepForMicroseconds(micros); - } - Status GetHostName(char* name, uint64_t len) override { - return env_target_->GetHostName(name, len); - } - Status GetCurrentTime(int64_t* unix_time) override { - return env_target_->GetCurrentTime(unix_time); - } - void SetBackgroundThreads(int num, Priority pri) override { - return env_target_->SetBackgroundThreads(num, pri); - } - int GetBackgroundThreads(Priority pri) override { - return env_target_->GetBackgroundThreads(pri); - } - - Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { - return env_target_->SetAllowNonOwnerAccess(allow_non_owner_access); - } - - void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { - return env_target_->IncBackgroundThreadsIfNeeded(num, pri); - } - - void LowerThreadPoolIOPriority(Priority pool) override { - env_target_->LowerThreadPoolIOPriority(pool); - } - - void LowerThreadPoolCPUPriority(Priority pool) override { - env_target_->LowerThreadPoolCPUPriority(pool); - } - - Status LowerThreadPoolCPUPriority(Priority pool, 
CpuPriority pri) override { - return env_target_->LowerThreadPoolCPUPriority(pool, pri); - } - - std::string TimeToString(uint64_t time) override { - return env_target_->TimeToString(time); - } - - Status GetThreadList(std::vector* thread_list) override { - return env_target_->GetThreadList(thread_list); - } - - ThreadStatusUpdater* GetThreadStatusUpdater() const override { - return env_target_->GetThreadStatusUpdater(); - } - - uint64_t GetThreadID() const override { return env_target_->GetThreadID(); } - - std::string GenerateUniqueId() override { - return env_target_->GenerateUniqueId(); - } EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override { return file_system_->OptimizeForLogRead(FileOptions(env_options)); } + EnvOptions OptimizeForManifestRead( const EnvOptions& env_options) const override { return file_system_->OptimizeForManifestRead(FileOptions(env_options)); } + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, const DBOptions& db_options) const override { return file_system_->OptimizeForLogWrite(FileOptions(env_options), db_options); } + EnvOptions OptimizeForManifestWrite( const EnvOptions& env_options) const override { return file_system_->OptimizeForManifestWrite(FileOptions(env_options)); } + EnvOptions OptimizeForCompactionTableWrite( const EnvOptions& env_options, const ImmutableDBOptions& immutable_ops) const override { @@ -613,7 +218,12 @@ class CompositeEnvWrapper : public Env { return file_system_->OptimizeForCompactionTableRead( FileOptions(env_options), db_options); } - + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return file_system_->OptimizeForBlobFileRead(FileOptions(env_options), + db_options); + } // This seems to clash with a macro on Windows, so #undef it here #ifdef GetFreeSpace #undef GetFreeSpace @@ -623,522 +233,113 @@ class CompositeEnvWrapper : public Env { IODebugContext dbg; return file_system_->GetFreeSpace(path, io_opts, diskfree, &dbg); } + uint64_t NowMicros() override { return system_clock_->NowMicros(); } + uint64_t NowNanos() override { return system_clock_->NowNanos(); } - private: - Env* env_target_; -}; - -class LegacySequentialFileWrapper : public FSSequentialFile { - public: - explicit LegacySequentialFileWrapper( - std::unique_ptr&& _target) - : target_(std::move(_target)) {} - - IOStatus Read(size_t n, const IOOptions& /*options*/, Slice* result, - char* scratch, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Read(n, result, scratch)); - } - IOStatus Skip(uint64_t n) override { - return status_to_io_status(target_->Skip(n)); - } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - IOStatus InvalidateCache(size_t offset, size_t length) override { - return status_to_io_status(target_->InvalidateCache(offset, length)); - } - IOStatus PositionedRead(uint64_t offset, size_t n, - const IOOptions& /*options*/, Slice* result, - char* scratch, IODebugContext* /*dbg*/) override { - return status_to_io_status( - target_->PositionedRead(offset, n, result, scratch)); - } - SequentialFile* target() { return target_.get(); } - - private: - std::unique_ptr target_; -}; + uint64_t NowCPUNanos() override { return system_clock_->CPUNanos(); } -class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { - public: - explicit LegacyRandomAccessFileWrapper( - std::unique_ptr&& 
target) - : target_(std::move(target)) {} - - IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, - Slice* result, char* scratch, - IODebugContext* /*dbg*/) const override { - return status_to_io_status(target_->Read(offset, n, result, scratch)); + void SleepForMicroseconds(int micros) override { + system_clock_->SleepForMicroseconds(micros); } - IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - std::vector reqs; - Status status; - - reqs.reserve(num_reqs); - for (size_t i = 0; i < num_reqs; ++i) { - ReadRequest req; - req.offset = fs_reqs[i].offset; - req.len = fs_reqs[i].len; - req.scratch = fs_reqs[i].scratch; - req.status = Status::OK(); - - reqs.emplace_back(req); - } - status = target_->MultiRead(reqs.data(), num_reqs); - for (size_t i = 0; i < num_reqs; ++i) { - fs_reqs[i].result = reqs[i].result; - fs_reqs[i].status = status_to_io_status(std::move(reqs[i].status)); - } - return status_to_io_status(std::move(status)); - ; - } - IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Prefetch(offset, n)); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - }; - void Hint(AccessPattern pattern) override { - target_->Hint((RandomAccessFile::AccessPattern)pattern); - } - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); + Status GetCurrentTime(int64_t* unix_time) override { + return system_clock_->GetCurrentTime(unix_time); } - IOStatus InvalidateCache(size_t offset, size_t length) override { - return status_to_io_status(target_->InvalidateCache(offset, length)); + std::string TimeToString(uint64_t time) override { + return system_clock_->TimeToString(time); } - - private: - std::unique_ptr target_; }; -class LegacyWritableFileWrapper : public FSWritableFile { +class CompositeEnvWrapper : public CompositeEnv { public: - explicit LegacyWritableFileWrapper(std::unique_ptr&& _target) - : target_(std::move(_target)) {} - - IOStatus Append(const Slice& data, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Append(data)); - } - IOStatus Append(const Slice& data, const IOOptions& /*options*/, - const DataVerificationInfo& /*verification_info*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Append(data)); - } - IOStatus PositionedAppend(const Slice& data, uint64_t offset, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->PositionedAppend(data, offset)); - } - IOStatus PositionedAppend(const Slice& data, uint64_t offset, - const IOOptions& /*options*/, - const DataVerificationInfo& /*verification_info*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->PositionedAppend(data, offset)); - } - IOStatus Truncate(uint64_t size, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Truncate(size)); - } - IOStatus Close(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Close()); - } - IOStatus Flush(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Flush()); - } - IOStatus Sync(const IOOptions& 
/*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Sync()); - } - IOStatus Fsync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Fsync()); - } - bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } - - bool use_direct_io() const override { return target_->use_direct_io(); } - - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - - void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { - target_->SetWriteLifeTimeHint(hint); - } - - Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { - return target_->GetWriteLifeTimeHint(); - } - - uint64_t GetFileSize(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return target_->GetFileSize(); - } - - void SetPreallocationBlockSize(size_t size) override { - target_->SetPreallocationBlockSize(size); - } - - void GetPreallocationStatus(size_t* block_size, - size_t* last_allocated_block) override { - target_->GetPreallocationStatus(block_size, last_allocated_block); - } - - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); - } + // Initialize a CompositeEnvWrapper that delegates all thread/time related + // calls to env, and all file operations to fs + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& fs) + : CompositeEnvWrapper(env, fs, env->GetSystemClock()) {} - IOStatus InvalidateCache(size_t offset, size_t length) override { - return status_to_io_status(target_->InvalidateCache(offset, length)); - } + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& sc) + : CompositeEnvWrapper(env, env->GetFileSystem(), sc) {} - IOStatus RangeSync(uint64_t offset, uint64_t nbytes, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->RangeSync(offset, nbytes)); - } + explicit CompositeEnvWrapper(Env* env, const std::shared_ptr& fs, + const std::shared_ptr& sc) + : CompositeEnv(fs, sc), env_target_(env) {} - void PrepareWrite(size_t offset, size_t len, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - target_->PrepareWrite(offset, len); - } + // Return the target to which this Env forwards all calls + Env* env_target() const { return env_target_; } - IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Allocate(offset, len)); +#if !defined(OS_WIN) && !defined(ROCKSDB_NO_DYNAMIC_EXTENSION) + Status LoadLibrary(const std::string& lib_name, + const std::string& search_path, + std::shared_ptr* result) override { + return env_target_->LoadLibrary(lib_name, search_path, result); } +#endif - WritableFile* target() { return target_.get(); } - - private: - std::unique_ptr target_; -}; - -class LegacyRandomRWFileWrapper : public FSRandomRWFile { - public: - explicit LegacyRandomRWFileWrapper(std::unique_ptr&& target) - : target_(std::move(target)) {} - - bool use_direct_io() const override { return target_->use_direct_io(); } - size_t GetRequiredBufferAlignment() const override { - return target_->GetRequiredBufferAlignment(); - } - IOStatus Write(uint64_t offset, const Slice& data, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Write(offset, data)); - } - IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, - Slice* result, char* 
scratch, - IODebugContext* /*dbg*/) const override { - return status_to_io_status(target_->Read(offset, n, result, scratch)); - } - IOStatus Flush(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Flush()); - } - IOStatus Sync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Sync()); - } - IOStatus Fsync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Fsync()); - } - IOStatus Close(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Close()); + void Schedule(void (*f)(void* arg), void* a, Priority pri, + void* tag = nullptr, void (*u)(void* arg) = nullptr) override { + return env_target_->Schedule(f, a, pri, tag, u); } - private: - std::unique_ptr target_; -}; - -class LegacyDirectoryWrapper : public FSDirectory { - public: - explicit LegacyDirectoryWrapper(std::unique_ptr&& target) - : target_(std::move(target)) {} - - IOStatus Fsync(const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Fsync()); - } - size_t GetUniqueId(char* id, size_t max_size) const override { - return target_->GetUniqueId(id, max_size); + int UnSchedule(void* tag, Priority pri) override { + return env_target_->UnSchedule(tag, pri); } - private: - std::unique_ptr target_; -}; - -class LegacyFileSystemWrapper : public FileSystem { - public: - // Initialize an EnvWrapper that delegates all calls to *t - explicit LegacyFileSystemWrapper(Env* t) : target_(t) {} - ~LegacyFileSystemWrapper() override {} - - const char* Name() const override { return "Legacy File System"; } - - // Return the target to which this Env forwards all calls - Env* target() const { return target_; } - - // The following text is boilerplate that forwards all methods to target() - IOStatus NewSequentialFile(const std::string& f, - const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewSequentialFile(f, &file, file_opts); - if (s.ok()) { - r->reset(new LegacySequentialFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewRandomAccessFile(const std::string& f, - const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewRandomAccessFile(f, &file, file_opts); - if (s.ok()) { - r->reset(new LegacyRandomAccessFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewWritableFile(f, &file, file_opts); - if (s.ok()) { - r->reset(new LegacyWritableFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus ReopenWritableFile(const std::string& fname, - const FileOptions& file_opts, - std::unique_ptr* result, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->ReopenWritableFile(fname, &file, file_opts); - if (s.ok()) { - result->reset(new LegacyWritableFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - const FileOptions& file_opts, - std::unique_ptr* r, - IODebugContext* /*dbg*/) override { 
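// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: with LegacyFileSystemWrapper
// moving out of this header and into env.cc, code that needs FileSystem
// semantics for a plain Env* is expected to go through Env::GetFileSystem(),
// which for a legacy Env returns the adapter installed by the Env base-class
// constructor. Only the FileSystem API already used throughout this file is
// assumed; the helper name is hypothetical.
#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/file_system.h"

// Creates (and immediately closes) an empty file through the Env's
// FileSystem, passing default FileOptions/IOOptions the same way the
// Composite*Wrapper classes in this file do.
ROCKSDB_NAMESPACE::IOStatus TouchFile(ROCKSDB_NAMESPACE::Env* env,
                                      const std::string& fname) {
  const std::shared_ptr<ROCKSDB_NAMESPACE::FileSystem>& fs =
      env->GetFileSystem();
  std::unique_ptr<ROCKSDB_NAMESPACE::FSWritableFile> file;
  ROCKSDB_NAMESPACE::IOStatus s = fs->NewWritableFile(
      fname, ROCKSDB_NAMESPACE::FileOptions(), &file, /*dbg=*/nullptr);
  if (s.ok()) {
    s = file->Close(ROCKSDB_NAMESPACE::IOOptions(), /*dbg=*/nullptr);
  }
  return s;
}
// ---------------------------------------------------------------------------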
- std::unique_ptr file; - Status s = target_->ReuseWritableFile(fname, old_fname, &file, file_opts); - if (s.ok()) { - r->reset(new LegacyWritableFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewRandomRWFile(const std::string& fname, - const FileOptions& file_opts, - std::unique_ptr* result, - IODebugContext* /*dbg*/) override { - std::unique_ptr file; - Status s = target_->NewRandomRWFile(fname, &file, file_opts); - if (s.ok()) { - result->reset(new LegacyRandomRWFileWrapper(std::move(file))); - } - return status_to_io_status(std::move(s)); - } - IOStatus NewMemoryMappedFileBuffer( - const std::string& fname, - std::unique_ptr* result) override { - return status_to_io_status( - target_->NewMemoryMappedFileBuffer(fname, result)); - } - IOStatus NewDirectory(const std::string& name, const IOOptions& /*io_opts*/, - std::unique_ptr* result, - IODebugContext* /*dbg*/) override { - std::unique_ptr dir; - Status s = target_->NewDirectory(name, &dir); - if (s.ok()) { - result->reset(new LegacyDirectoryWrapper(std::move(dir))); - } - return status_to_io_status(std::move(s)); - } - IOStatus FileExists(const std::string& f, const IOOptions& /*io_opts*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->FileExists(f)); - } - IOStatus GetChildren(const std::string& dir, const IOOptions& /*io_opts*/, - std::vector* r, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetChildren(dir, r)); - } - IOStatus GetChildrenFileAttributes(const std::string& dir, - const IOOptions& /*options*/, - std::vector* result, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetChildrenFileAttributes(dir, result)); - } - IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->DeleteFile(f)); - } - IOStatus Truncate(const std::string& fname, size_t size, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->Truncate(fname, size)); - } - IOStatus CreateDir(const std::string& d, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->CreateDir(d)); - } - IOStatus CreateDirIfMissing(const std::string& d, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->CreateDirIfMissing(d)); - } - IOStatus DeleteDir(const std::string& d, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->DeleteDir(d)); + void StartThread(void (*f)(void*), void* a) override { + return env_target_->StartThread(f, a); } - IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, - uint64_t* s, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetFileSize(f, s)); + void WaitForJoin() override { return env_target_->WaitForJoin(); } + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override { + return env_target_->GetThreadPoolQueueLen(pri); } - IOStatus GetFileModificationTime(const std::string& fname, - const IOOptions& /*options*/, - uint64_t* file_mtime, - IODebugContext* /*dbg*/) override { - return status_to_io_status( - target_->GetFileModificationTime(fname, file_mtime)); + Status GetHostName(char* name, uint64_t len) override { + return env_target_->GetHostName(name, len); } - - IOStatus GetAbsolutePath(const std::string& db_path, - const IOOptions& /*options*/, - std::string* 
output_path, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetAbsolutePath(db_path, output_path)); + void SetBackgroundThreads(int num, Priority pri) override { + return env_target_->SetBackgroundThreads(num, pri); } - - IOStatus RenameFile(const std::string& s, const std::string& t, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->RenameFile(s, t)); + int GetBackgroundThreads(Priority pri) override { + return env_target_->GetBackgroundThreads(pri); } - IOStatus LinkFile(const std::string& s, const std::string& t, - const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->LinkFile(s, t)); + Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { + return env_target_->SetAllowNonOwnerAccess(allow_non_owner_access); } - IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/, - uint64_t* count, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->NumFileLinks(fname, count)); + void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { + return env_target_->IncBackgroundThreadsIfNeeded(num, pri); } - IOStatus AreFilesSame(const std::string& first, const std::string& second, - const IOOptions& /*options*/, bool* res, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->AreFilesSame(first, second, res)); + void LowerThreadPoolIOPriority(Priority pool) override { + env_target_->LowerThreadPoolIOPriority(pool); } - IOStatus LockFile(const std::string& f, const IOOptions& /*options*/, - FileLock** l, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->LockFile(f, l)); + void LowerThreadPoolCPUPriority(Priority pool) override { + env_target_->LowerThreadPoolCPUPriority(pool); } - IOStatus UnlockFile(FileLock* l, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->UnlockFile(l)); + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + return env_target_->LowerThreadPoolCPUPriority(pool, pri); } - IOStatus GetTestDirectory(const IOOptions& /*options*/, std::string* path, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetTestDirectory(path)); - } - IOStatus NewLogger(const std::string& fname, const IOOptions& /*options*/, - std::shared_ptr* result, - IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->NewLogger(fname, result)); + Status GetThreadList(std::vector* thread_list) override { + return env_target_->GetThreadList(thread_list); } - void SanitizeFileOptions(FileOptions* opts) const override { - target_->SanitizeEnvOptions(opts); + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return env_target_->GetThreadStatusUpdater(); } - FileOptions OptimizeForLogRead( - const FileOptions& file_options) const override { - return target_->OptimizeForLogRead(file_options); - } - FileOptions OptimizeForManifestRead( - const FileOptions& file_options) const override { - return target_->OptimizeForManifestRead(file_options); - } - FileOptions OptimizeForLogWrite(const FileOptions& file_options, - const DBOptions& db_options) const override { - return target_->OptimizeForLogWrite(file_options, db_options); - } - FileOptions OptimizeForManifestWrite( - const FileOptions& file_options) const override { - return target_->OptimizeForManifestWrite(file_options); - } - FileOptions OptimizeForCompactionTableWrite( - const 
FileOptions& file_options, - const ImmutableDBOptions& immutable_ops) const override { - return target_->OptimizeForCompactionTableWrite(file_options, - immutable_ops); - } - FileOptions OptimizeForCompactionTableRead( - const FileOptions& file_options, - const ImmutableDBOptions& db_options) const override { - return target_->OptimizeForCompactionTableRead(file_options, db_options); - } + uint64_t GetThreadID() const override { return env_target_->GetThreadID(); } -// This seems to clash with a macro on Windows, so #undef it here -#ifdef GetFreeSpace -#undef GetFreeSpace -#endif - IOStatus GetFreeSpace(const std::string& path, const IOOptions& /*options*/, - uint64_t* diskfree, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->GetFreeSpace(path, diskfree)); - } - IOStatus IsDirectory(const std::string& path, const IOOptions& /*options*/, - bool* is_dir, IODebugContext* /*dbg*/) override { - return status_to_io_status(target_->IsDirectory(path, is_dir)); + std::string GenerateUniqueId() override { + return env_target_->GenerateUniqueId(); } private: - Env* target_; + Env* env_target_; }; -inline std::unique_ptr NewLegacySequentialFileWrapper( - std::unique_ptr& file) { - return std::unique_ptr( - new LegacySequentialFileWrapper(std::move(file))); -} - -inline std::unique_ptr NewLegacyRandomAccessFileWrapper( - std::unique_ptr& file) { - return std::unique_ptr( - new LegacyRandomAccessFileWrapper(std::move(file))); -} - -inline std::unique_ptr NewLegacyWritableFileWrapper( - std::unique_ptr&& file) { - return std::unique_ptr( - new LegacyWritableFileWrapper(std::move(file))); -} - +std::unique_ptr NewLegacySequentialFileWrapper( + std::unique_ptr& file); } // namespace ROCKSDB_NAMESPACE diff --git a/env/env.cc b/env/env.cc index 829fcefb15d..000a52575ef 100644 --- a/env/env.cc +++ b/env/env.cc @@ -10,25 +10,569 @@ #include "rocksdb/env.h" #include + #include "env/composite_env_wrapper.h" #include "logging/env_logger.h" #include "memory/arena.h" #include "options/db_options.h" #include "port/port.h" -#include "port/sys_time.h" +#include "rocksdb/convenience.h" #include "rocksdb/options.h" +#include "rocksdb/system_clock.h" #include "rocksdb/utilities/object_registry.h" #include "util/autovector.h" namespace ROCKSDB_NAMESPACE { +namespace { +class LegacySystemClock : public SystemClock { + private: + Env* env_; + + public: + explicit LegacySystemClock(Env* env) : env_(env) {} + const char* Name() const override { return "Legacy System Clock"; } + + // Returns the number of micro-seconds since some fixed point in time. + // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + uint64_t NowMicros() override { return env_->NowMicros(); } + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + uint64_t NowNanos() override { return env_->NowNanos(); } + + uint64_t CPUMicros() override { return CPUNanos() / 1000; } + uint64_t CPUNanos() override { return env_->NowCPUNanos(); } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + void SleepForMicroseconds(int micros) override { + env_->SleepForMicroseconds(micros); + } + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). 
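// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: once every Env carries a
// SystemClock (installed for legacy Envs by LegacySystemClock above), timing
// code can be written against the clock rather than the Env. Only
// Env::GetSystemClock() and SystemClock::NowMicros(), both introduced in this
// change, are assumed; the helper name is hypothetical.
#include <cstdint>
#include <functional>
#include <memory>

#include "rocksdb/env.h"
#include "rocksdb/system_clock.h"

// Returns how long `fn` took in microseconds, as measured by the Env's clock.
uint64_t TimeCallMicros(ROCKSDB_NAMESPACE::Env* env,
                        const std::function<void()>& fn) {
  const std::shared_ptr<ROCKSDB_NAMESPACE::SystemClock>& clock =
      env->GetSystemClock();
  const uint64_t start = clock->NowMicros();
  fn();
  return clock->NowMicros() - start;
}
// ---------------------------------------------------------------------------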
+ // Only overwrites *unix_time on success. + Status GetCurrentTime(int64_t* unix_time) override { + return env_->GetCurrentTime(unix_time); + } + // Converts seconds-since-Jan-01-1970 to a printable string + std::string TimeToString(uint64_t time) override { + return env_->TimeToString(time); + } +}; + +class LegacySequentialFileWrapper : public FSSequentialFile { + public: + explicit LegacySequentialFileWrapper( + std::unique_ptr&& _target) + : target_(std::move(_target)) {} + + IOStatus Read(size_t n, const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Read(n, result, scratch)); + } + IOStatus Skip(uint64_t n) override { + return status_to_io_status(target_->Skip(n)); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + IOStatus PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + return status_to_io_status( + target_->PositionedRead(offset, n, result, scratch)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { + public: + explicit LegacyRandomAccessFileWrapper( + std::unique_ptr&& target) + : target_(std::move(target)) {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + return status_to_io_status(target_->Read(offset, n, result, scratch)); + } + + IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + std::vector reqs; + Status status; + + reqs.reserve(num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + ReadRequest req; + + req.offset = fs_reqs[i].offset; + req.len = fs_reqs[i].len; + req.scratch = fs_reqs[i].scratch; + req.status = Status::OK(); + + reqs.emplace_back(req); + } + status = target_->MultiRead(reqs.data(), num_reqs); + for (size_t i = 0; i < num_reqs; ++i) { + fs_reqs[i].result = reqs[i].result; + fs_reqs[i].status = status_to_io_status(std::move(reqs[i].status)); + } + return status_to_io_status(std::move(status)); + } + + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Prefetch(offset, n)); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + void Hint(AccessPattern pattern) override { + target_->Hint((RandomAccessFile::AccessPattern)pattern); + } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyRandomRWFileWrapper : public FSRandomRWFile { + public: + explicit LegacyRandomRWFileWrapper(std::unique_ptr&& target) + : target_(std::move(target)) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + 
return target_->GetRequiredBufferAlignment(); + } + IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Write(offset, data)); + } + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { + return status_to_io_status(target_->Read(offset, n, result, scratch)); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Flush()); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Sync()); + } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + + private: + std::unique_ptr target_; +}; + +class LegacyWritableFileWrapper : public FSWritableFile { + public: + explicit LegacyWritableFileWrapper(std::unique_ptr&& _target) + : target_(std::move(_target)) {} + + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Append(data)); + } + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Append(data)); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->PositionedAppend(data, offset)); + } + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + const DataVerificationInfo& /*verification_info*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->PositionedAppend(data, offset)); + } + IOStatus Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Truncate(size)); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Close()); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Flush()); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Sync()); + } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return target_->GetFileSize(); + } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + + void GetPreallocationStatus(size_t* 
block_size, + size_t* last_allocated_block) override { + target_->GetPreallocationStatus(block_size, last_allocated_block); + } + + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + IOStatus InvalidateCache(size_t offset, size_t length) override { + return status_to_io_status(target_->InvalidateCache(offset, length)); + } + + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->RangeSync(offset, nbytes)); + } + + void PrepareWrite(size_t offset, size_t len, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + target_->PrepareWrite(offset, len); + } + + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Allocate(offset, len)); + } + + private: + std::unique_ptr target_; +}; + +class LegacyDirectoryWrapper : public FSDirectory { + public: + explicit LegacyDirectoryWrapper(std::unique_ptr&& target) + : target_(std::move(target)) {} + + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Fsync()); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + std::unique_ptr target_; +}; + +class LegacyFileSystemWrapper : public FileSystem { + public: + // Initialize an EnvWrapper that delegates all calls to *t + explicit LegacyFileSystemWrapper(Env* t) : target_(t) {} + ~LegacyFileSystemWrapper() override {} + + const char* Name() const override { return "Legacy File System"; } + + // Return the target to which this Env forwards all calls + Env* target() const { return target_; } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewSequentialFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacySequentialFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewRandomAccessFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyRandomAccessFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewWritableFile(f, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->ReopenWritableFile(fname, &file, file_opts); + if (s.ok()) { + result->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + 
std::unique_ptr file; + Status s = target_->ReuseWritableFile(fname, old_fname, &file, file_opts); + if (s.ok()) { + r->reset(new LegacyWritableFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr file; + Status s = target_->NewRandomRWFile(fname, &file, file_opts); + if (s.ok()) { + result->reset(new LegacyRandomRWFileWrapper(std::move(file))); + } + return status_to_io_status(std::move(s)); + } + IOStatus NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) override { + return status_to_io_status( + target_->NewMemoryMappedFileBuffer(fname, result)); + } + IOStatus NewDirectory(const std::string& name, const IOOptions& /*io_opts*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override { + std::unique_ptr dir; + Status s = target_->NewDirectory(name, &dir); + if (s.ok()) { + result->reset(new LegacyDirectoryWrapper(std::move(dir))); + } + return status_to_io_status(std::move(s)); + } + IOStatus FileExists(const std::string& f, const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->FileExists(f)); + } + IOStatus GetChildren(const std::string& dir, const IOOptions& /*io_opts*/, + std::vector* r, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetChildren(dir, r)); + } + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& /*options*/, + std::vector* result, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetChildrenFileAttributes(dir, result)); + } + IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->DeleteFile(f)); + } + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->Truncate(fname, size)); + } + IOStatus CreateDir(const std::string& d, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->CreateDir(d)); + } + IOStatus CreateDirIfMissing(const std::string& d, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->CreateDirIfMissing(d)); + } + IOStatus DeleteDir(const std::string& d, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->DeleteDir(d)); + } + IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, + uint64_t* s, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetFileSize(f, s)); + } + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* file_mtime, + IODebugContext* /*dbg*/) override { + return status_to_io_status( + target_->GetFileModificationTime(fname, file_mtime)); + } + + IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetAbsolutePath(db_path, output_path)); + } + + IOStatus RenameFile(const std::string& s, const std::string& t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->RenameFile(s, t)); + } + + IOStatus LinkFile(const std::string& s, const 
std::string& t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->LinkFile(s, t)); + } + + IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*options*/, + uint64_t* count, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->NumFileLinks(fname, count)); + } + + IOStatus AreFilesSame(const std::string& first, const std::string& second, + const IOOptions& /*options*/, bool* res, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->AreFilesSame(first, second, res)); + } + + IOStatus LockFile(const std::string& f, const IOOptions& /*options*/, + FileLock** l, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->LockFile(f, l)); + } + + IOStatus UnlockFile(FileLock* l, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->UnlockFile(l)); + } + + IOStatus GetTestDirectory(const IOOptions& /*options*/, std::string* path, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetTestDirectory(path)); + } + IOStatus NewLogger(const std::string& fname, const IOOptions& /*options*/, + std::shared_ptr* result, + IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->NewLogger(fname, result)); + } + + void SanitizeFileOptions(FileOptions* opts) const override { + target_->SanitizeEnvOptions(opts); + } + + FileOptions OptimizeForLogRead( + const FileOptions& file_options) const override { + return target_->OptimizeForLogRead(file_options); + } + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestRead(file_options); + } + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override { + return target_->OptimizeForLogWrite(file_options, db_options); + } + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override { + return target_->OptimizeForManifestWrite(file_options); + } + FileOptions OptimizeForCompactionTableWrite( + const FileOptions& file_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_->OptimizeForCompactionTableWrite(file_options, + immutable_ops); + } + FileOptions OptimizeForCompactionTableRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForCompactionTableRead(file_options, db_options); + } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } + +#ifdef GetFreeSpace +#undef GetFreeSpace +#endif + IOStatus GetFreeSpace(const std::string& path, const IOOptions& /*options*/, + uint64_t* diskfree, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->GetFreeSpace(path, diskfree)); + } + IOStatus IsDirectory(const std::string& path, const IOOptions& /*options*/, + bool* is_dir, IODebugContext* /*dbg*/) override { + return status_to_io_status(target_->IsDirectory(path, is_dir)); + } + + private: + Env* target_; +}; +} // end anonymous namespace Env::Env() : thread_status_updater_(nullptr) { file_system_ = std::make_shared(this); + system_clock_ = std::make_shared(this); } -Env::Env(std::shared_ptr fs) - : thread_status_updater_(nullptr), - file_system_(fs) {} +Env::Env(const std::shared_ptr& fs) + : thread_status_updater_(nullptr), 
file_system_(fs) { + system_clock_ = std::make_shared(this); +} + +Env::Env(const std::shared_ptr& fs, + const std::shared_ptr& clock) + : thread_status_updater_(nullptr), file_system_(fs), system_clock_(clock) {} Env::~Env() { } @@ -39,11 +583,18 @@ Status Env::NewLogger(const std::string& fname, } Status Env::LoadEnv(const std::string& value, Env** result) { + return CreateFromString(ConfigOptions(), value, result); +} + +Status Env::CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result) { Env* env = *result; Status s; #ifndef ROCKSDB_LITE + (void)config_options; s = ObjectRegistry::NewInstance()->NewStaticObject(value, &env); #else + (void)config_options; s = Status::NotSupported("Cannot load environment in LITE mode", value); #endif if (s.ok()) { @@ -54,18 +605,29 @@ Status Env::LoadEnv(const std::string& value, Env** result) { Status Env::LoadEnv(const std::string& value, Env** result, std::shared_ptr* guard) { + return CreateFromString(ConfigOptions(), value, result, guard); +} + +Status Env::CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result, + std::shared_ptr* guard) { assert(result); + if (value.empty()) { + *result = Env::Default(); + return Status::OK(); + } Status s; #ifndef ROCKSDB_LITE Env* env = nullptr; std::unique_ptr uniq_guard; std::string err_msg; assert(guard != nullptr); + (void)config_options; env = ObjectRegistry::NewInstance()->NewObject(value, &uniq_guard, &err_msg); if (!env) { - s = Status::NotFound(std::string("Cannot load ") + Env::Type() + ": " + - value); + s = Status::NotSupported(std::string("Cannot load ") + Env::Type() + ": " + + value); env = Env::Default(); } if (s.ok() && uniq_guard) { @@ -75,6 +637,7 @@ Status Env::LoadEnv(const std::string& value, Env** result, *result = env; } #else + (void)config_options; (void)result; (void)guard; s = Status::NotSupported("Cannot load environment in LITE mode", value); @@ -82,6 +645,30 @@ Status Env::LoadEnv(const std::string& value, Env** result, return s; } +Status Env::CreateFromUri(const ConfigOptions& config_options, + const std::string& env_uri, const std::string& fs_uri, + Env** result, std::shared_ptr* guard) { + *result = config_options.env; + if (env_uri.empty() && fs_uri.empty()) { + // Neither specified. Use the default + guard->reset(); + return Status::OK(); + } else if (!env_uri.empty() && !fs_uri.empty()) { + // Both specified. Cannot choose. Return Invalid + return Status::InvalidArgument("cannot specify both fs_uri and env_uri"); + } else if (fs_uri.empty()) { // Only have an ENV URI. Create an Env from it + return CreateFromString(config_options, env_uri, result, guard); + } else { + std::shared_ptr fs; + Status s = FileSystem::CreateFromString(config_options, fs_uri, &fs); + if (s.ok()) { + guard->reset(new CompositeEnvWrapper(*result, fs)); + *result = guard->get(); + } + return s; + } +} + std::string Env::PriorityToString(Env::Priority priority) { switch (priority) { case Env::Priority::BOTTOM: @@ -140,6 +727,16 @@ Status Env::GetChildrenFileAttributes(const std::string& dir, return Status::OK(); } +Status Env::GetHostNameString(std::string* result) { + std::array hostname_buf; + Status s = GetHostName(hostname_buf.data(), hostname_buf.size()); + if (s.ok()) { + hostname_buf[hostname_buf.size() - 1] = '\0'; + result->assign(hostname_buf.data()); + } + return s; +} + SequentialFile::~SequentialFile() { } @@ -377,13 +974,13 @@ void Log(const std::shared_ptr& info_log, const char* format, ...) 
{ Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, bool should_sync) { - LegacyFileSystemWrapper lfsw(env); - return WriteStringToFile(&lfsw, data, fname, should_sync); + const auto& fs = env->GetFileSystem(); + return WriteStringToFile(fs.get(), data, fname, should_sync); } Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { - LegacyFileSystemWrapper lfsw(env); - return ReadFileToString(&lfsw, fname, data); + const auto& fs = env->GetFileSystem(); + return ReadFileToString(fs.get(), fname, data); } EnvWrapper::~EnvWrapper() { @@ -449,6 +1046,12 @@ EnvOptions Env::OptimizeForCompactionTableRead( optimized_env_options.use_direct_reads = db_options.use_direct_reads; return optimized_env_options; } +EnvOptions Env::OptimizeForBlobFileRead( + const EnvOptions& env_options, const ImmutableDBOptions& db_options) const { + EnvOptions optimized_env_options(env_options); + optimized_env_options.use_direct_reads = db_options.use_direct_reads; + return optimized_env_options; +} EnvOptions::EnvOptions(const DBOptions& options) { AssignEnvOptions(this, options); @@ -461,18 +1064,18 @@ EnvOptions::EnvOptions() { Status NewEnvLogger(const std::string& fname, Env* env, std::shared_ptr* result) { - EnvOptions options; + FileOptions options; // TODO: Tune the buffer size. options.writable_file_max_buffer_size = 1024 * 1024; - std::unique_ptr writable_file; - const auto status = env->NewWritableFile(fname, &writable_file, options); + std::unique_ptr writable_file; + const auto status = env->GetFileSystem()->NewWritableFile( + fname, options, &writable_file, nullptr); if (!status.ok()) { return status; } - *result = std::make_shared( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, options, - env); + *result = std::make_shared(std::move(writable_file), fname, + options, env); return Status::OK(); } @@ -480,10 +1083,14 @@ const std::shared_ptr& Env::GetFileSystem() const { return file_system_; } -#ifdef OS_WIN -std::unique_ptr NewCompositeEnv(std::shared_ptr fs) { - return std::unique_ptr(new CompositeEnvWrapper(Env::Default(), fs)); +const std::shared_ptr& Env::GetSystemClock() const { + return system_clock_; +} + +std::unique_ptr NewLegacySequentialFileWrapper( + std::unique_ptr& file) { + return std::unique_ptr( + new LegacySequentialFileWrapper(std::move(file))); } -#endif } // namespace ROCKSDB_NAMESPACE diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index cc91e10eb99..e8e3df5f65a 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -10,6 +10,7 @@ #include #include "env/mock_env.h" +#include "file/file_util.h" #include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/env_encryption.h" @@ -17,46 +18,6 @@ namespace ROCKSDB_NAMESPACE { -// Normalizes trivial differences across Envs such that these test cases can -// run on all Envs. -class NormalizingEnvWrapper : public EnvWrapper { - private: - std::unique_ptr base_; - - public: - explicit NormalizingEnvWrapper(std::unique_ptr&& base) - : EnvWrapper(base.get()), base_(std::move(base)) {} - explicit NormalizingEnvWrapper(Env* base) : EnvWrapper(base) {} - - // Removes . and .. from directory listing - Status GetChildren(const std::string& dir, - std::vector* result) override { - Status status = EnvWrapper::GetChildren(dir, result); - if (status.ok()) { - result->erase(std::remove_if(result->begin(), result->end(), - [](const std::string& s) { - return s == "." || s == ".."; - }), - result->end()); - } - return status; - } - - // Removes . 
and .. from directory listing - Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override { - Status status = EnvWrapper::GetChildrenFileAttributes(dir, result); - if (status.ok()) { - result->erase(std::remove_if(result->begin(), result->end(), - [](const FileAttributes& fa) { - return fa.name == "." || fa.name == ".."; - }), - result->end()); - } - return status; - } -}; - class EnvBasicTestWithParam : public testing::Test, public ::testing::WithParamInterface { public: @@ -68,32 +29,17 @@ class EnvBasicTestWithParam : public testing::Test, test_dir_ = test::PerThreadDBPath(env_, "env_basic_test"); } - void SetUp() override { - env_->CreateDirIfMissing(test_dir_).PermitUncheckedError(); - } + void SetUp() override { ASSERT_OK(env_->CreateDirIfMissing(test_dir_)); } - void TearDown() override { - std::vector files; - env_->GetChildren(test_dir_, &files).PermitUncheckedError(); - for (const auto& file : files) { - // don't know whether it's file or directory, try both. The tests must - // only create files or empty directories, so one must succeed, else the - // directory's corrupted. - Status s = env_->DeleteFile(test_dir_ + "/" + file); - if (!s.ok()) { - ASSERT_OK(env_->DeleteDir(test_dir_ + "/" + file)); - } - } - } + void TearDown() override { ASSERT_OK(DestroyDir(env_, test_dir_)); } }; class EnvMoreTestWithParam : public EnvBasicTestWithParam {}; -static std::unique_ptr def_env(new NormalizingEnvWrapper(Env::Default())); INSTANTIATE_TEST_CASE_P(EnvDefault, EnvBasicTestWithParam, - ::testing::Values(def_env.get())); + ::testing::Values(Env::Default())); INSTANTIATE_TEST_CASE_P(EnvDefault, EnvMoreTestWithParam, - ::testing::Values(def_env.get())); + ::testing::Values(Env::Default())); static std::unique_ptr mock_env(new MockEnv(Env::Default())); INSTANTIATE_TEST_CASE_P(MockEnv, EnvBasicTestWithParam, @@ -104,8 +50,7 @@ static Env* NewTestEncryptedEnv(Env* base, const std::string& provider_id) { std::shared_ptr provider; EXPECT_OK(EncryptionProvider::CreateFromString(ConfigOptions(), provider_id, &provider)); - std::unique_ptr encrypted(NewEncryptedEnv(base, provider)); - return new NormalizingEnvWrapper(std::move(encrypted)); + return NewEncryptedEnv(base, provider); } // next statements run env test against default encryption code. @@ -130,19 +75,30 @@ namespace { // The purpose of returning an empty vector (instead of nullptr) is that gtest // ValuesIn() will skip running tests when given an empty collection. 
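To show how the empty-vector convention mentioned above is consumed, here is a minimal sketch of the parameterized-test instantiations that would sit after the GetCustomEnvs() helper defined next; the "CustomEnv" instantiation name is an assumption for illustration, not taken from this patch.

// Sketch only: runs the basic and extended Env tests against any Env built
// from TEST_ENV_URI / TEST_FS_URI. With neither variable set, GetCustomEnvs()
// returns an empty vector and gtest's ValuesIn() simply skips these
// instantiations.
INSTANTIATE_TEST_CASE_P(CustomEnv, EnvBasicTestWithParam,
                        ::testing::ValuesIn(GetCustomEnvs()));
INSTANTIATE_TEST_CASE_P(CustomEnv, EnvMoreTestWithParam,
                        ::testing::ValuesIn(GetCustomEnvs()));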
std::vector GetCustomEnvs() { - static Env* custom_env; static bool init = false; + static std::vector res; if (!init) { init = true; const char* uri = getenv("TEST_ENV_URI"); if (uri != nullptr) { - Env::LoadEnv(uri, &custom_env); + static std::shared_ptr env_guard; + static Env* custom_env; + Status s = + Env::CreateFromUri(ConfigOptions(), uri, "", &custom_env, &env_guard); + if (s.ok()) { + res.emplace_back(custom_env); + } + } + uri = getenv("TEST_FS_URI"); + if (uri != nullptr) { + static std::shared_ptr fs_env_guard; + static Env* fs_env; + Status s = + Env::CreateFromUri(ConfigOptions(), "", uri, &fs_env, &fs_env_guard); + if (s.ok()) { + res.emplace_back(fs_env); + } } - } - - std::vector res; - if (custom_env != nullptr) { - res.emplace_back(custom_env); } return res; } @@ -228,8 +184,8 @@ TEST_P(EnvBasicTestWithParam, Basics) { ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/g")); ASSERT_OK(env_->GetChildren(test_dir_, &children)); ASSERT_EQ(0U, children.size()); - ASSERT_TRUE( - env_->GetChildren(test_dir_ + "/non_existent", &children).IsNotFound()); + Status s = env_->GetChildren(test_dir_ + "/non_existent", &children); + ASSERT_TRUE(s.IsNotFound()); } TEST_P(EnvBasicTestWithParam, ReadWrite) { @@ -325,7 +281,7 @@ TEST_P(EnvMoreTestWithParam, MakeDir) { ASSERT_OK(env_->CreateDir(test_dir_ + "/j")); ASSERT_OK(env_->FileExists(test_dir_ + "/j")); std::vector children; - env_->GetChildren(test_dir_, &children); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); ASSERT_EQ(1U, children.size()); // fail because file already exists ASSERT_TRUE(!env_->CreateDir(test_dir_ + "/j").ok()); @@ -374,6 +330,32 @@ TEST_P(EnvMoreTestWithParam, GetChildren) { ASSERT_EQ(0U, children.size()); } +TEST_P(EnvMoreTestWithParam, GetChildrenIgnoresDotAndDotDot) { + auto* env = Env::Default(); + ASSERT_OK(env->CreateDirIfMissing(test_dir_)); + + // Create a single file + std::string path = test_dir_; + const EnvOptions soptions; +#ifdef OS_WIN + path.append("\\test_file"); +#else + path.append("/test_file"); +#endif + std::string data("test data"); + std::unique_ptr file; + ASSERT_OK(env->NewWritableFile(path, &file, soptions)); + ASSERT_OK(file->Append("test data")); + + // get the children + std::vector result; + ASSERT_OK(env->GetChildren(test_dir_, &result)); + + // expect only one file named `test_data`, i.e. 
no `.` or `..` names + ASSERT_EQ(result.size(), 1); + ASSERT_EQ(result.at(0), "test_file"); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/env/env_chroot.cc b/env/env_chroot.cc index 4bc2f9a2507..ff47049e0f0 100644 --- a/env/env_chroot.cc +++ b/env/env_chroot.cc @@ -7,23 +7,21 @@ #include "env/env_chroot.h" -#include -#include -#include -#include +#include // errno +#include // realpath, free +#include // geteuid -#include -#include -#include - -#include "rocksdb/status.h" +#include "env/composite_env_wrapper.h" +#include "env/fs_remap.h" +#include "util/string_util.h" // errnoStr namespace ROCKSDB_NAMESPACE { - -class ChrootEnv : public EnvWrapper { +namespace { +class ChrootFileSystem : public RemapFileSystem { public: - ChrootEnv(Env* base_env, const std::string& chroot_dir) - : EnvWrapper(base_env) { + ChrootFileSystem(const std::shared_ptr& base, + const std::string& chroot_dir) + : RemapFileSystem(base) { #if defined(OS_AIX) char resolvedName[PATH_MAX]; char* real_chroot_dir = realpath(chroot_dir.c_str(), resolvedName); @@ -38,217 +36,10 @@ class ChrootEnv : public EnvWrapper { #endif } - Status RegisterDbPaths(const std::vector& paths) override { - std::vector encoded_paths; - encoded_paths.reserve(paths.size()); - for (auto& path : paths) { - auto status_and_enc_path = EncodePathWithNewBasename(path); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - encoded_paths.emplace_back(status_and_enc_path.second); - } - return EnvWrapper::Env::RegisterDbPaths(encoded_paths); - } - - Status UnregisterDbPaths(const std::vector& paths) override { - std::vector encoded_paths; - encoded_paths.reserve(paths.size()); - for (auto& path : paths) { - auto status_and_enc_path = EncodePathWithNewBasename(path); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - encoded_paths.emplace_back(status_and_enc_path.second); - } - return EnvWrapper::Env::UnregisterDbPaths(encoded_paths); - } - - Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewSequentialFile(status_and_enc_path.second, result, - options); - } - - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewRandomAccessFile(status_and_enc_path.second, result, - options); - } - - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewWritableFile(status_and_enc_path.second, result, - options); - } - - Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - auto status_and_old_enc_path = EncodePath(old_fname); - if (!status_and_old_enc_path.first.ok()) { - return 
status_and_old_enc_path.first; - } - return EnvWrapper::ReuseWritableFile(status_and_old_enc_path.second, - status_and_old_enc_path.second, result, - options); - } - - Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewRandomRWFile(status_and_enc_path.second, result, - options); - } - - Status NewDirectory(const std::string& dir, - std::unique_ptr* result) override { - auto status_and_enc_path = EncodePathWithNewBasename(dir); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewDirectory(status_and_enc_path.second, result); - } - - Status FileExists(const std::string& fname) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::FileExists(status_and_enc_path.second); - } - - Status GetChildren(const std::string& dir, - std::vector* result) override { - auto status_and_enc_path = EncodePath(dir); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetChildren(status_and_enc_path.second, result); - } - - Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override { - auto status_and_enc_path = EncodePath(dir); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetChildrenFileAttributes(status_and_enc_path.second, - result); - } - - Status DeleteFile(const std::string& fname) override { - auto status_and_enc_path = EncodePath(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::DeleteFile(status_and_enc_path.second); - } - - Status CreateDir(const std::string& dirname) override { - auto status_and_enc_path = EncodePathWithNewBasename(dirname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::CreateDir(status_and_enc_path.second); - } - - Status CreateDirIfMissing(const std::string& dirname) override { - auto status_and_enc_path = EncodePathWithNewBasename(dirname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::CreateDirIfMissing(status_and_enc_path.second); - } - - Status DeleteDir(const std::string& dirname) override { - auto status_and_enc_path = EncodePath(dirname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::DeleteDir(status_and_enc_path.second); - } + const char* Name() const override { return "ChrootFS"; } - Status GetFileSize(const std::string& fname, uint64_t* file_size) override { - auto status_and_enc_path = EncodePath(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetFileSize(status_and_enc_path.second, file_size); - } - - Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) override { - auto status_and_enc_path = EncodePath(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetFileModificationTime(status_and_enc_path.second, - file_mtime); - } - - Status RenameFile(const std::string& src, const std::string& dest) override { - auto status_and_src_enc_path = EncodePath(src); - if 
(!status_and_src_enc_path.first.ok()) { - return status_and_src_enc_path.first; - } - auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); - if (!status_and_dest_enc_path.first.ok()) { - return status_and_dest_enc_path.first; - } - return EnvWrapper::RenameFile(status_and_src_enc_path.second, - status_and_dest_enc_path.second); - } - - Status LinkFile(const std::string& src, const std::string& dest) override { - auto status_and_src_enc_path = EncodePath(src); - if (!status_and_src_enc_path.first.ok()) { - return status_and_src_enc_path.first; - } - auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); - if (!status_and_dest_enc_path.first.ok()) { - return status_and_dest_enc_path.first; - } - return EnvWrapper::LinkFile(status_and_src_enc_path.second, - status_and_dest_enc_path.second); - } - - Status LockFile(const std::string& fname, FileLock** lock) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - // FileLock subclasses may store path (e.g., PosixFileLock stores it). We - // can skip stripping the chroot directory from this path because callers - // shouldn't use it. - return EnvWrapper::LockFile(status_and_enc_path.second, lock); - } - - Status GetTestDirectory(std::string* path) override { + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override { // Adapted from PosixEnv's implementation since it doesn't provide a way to // create directory in the chroot. char buf[256]; @@ -256,36 +47,19 @@ class ChrootEnv : public EnvWrapper { *path = buf; // Directory may already exist, so ignore return - return CreateDirIfMissing(*path); - } - - Status NewLogger(const std::string& fname, - std::shared_ptr* result) override { - auto status_and_enc_path = EncodePathWithNewBasename(fname); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::NewLogger(status_and_enc_path.second, result); - } - - Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) override { - auto status_and_enc_path = EncodePath(db_path); - if (!status_and_enc_path.first.ok()) { - return status_and_enc_path.first; - } - return EnvWrapper::GetAbsolutePath(status_and_enc_path.second, output_path); + return CreateDirIfMissing(*path, options, dbg); } - private: + protected: // Returns status and expanded absolute path including the chroot directory. // Checks whether the provided path breaks out of the chroot. If it returns // non-OK status, the returned path should not be used. 
- std::pair EncodePath(const std::string& path) { + std::pair EncodePath( + const std::string& path) override { if (path.empty() || path[0] != '/') { - return {Status::InvalidArgument(path, "Not an absolute path"), ""}; + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; } - std::pair res; + std::pair res; res.second = chroot_dir_ + path; #if defined(OS_AIX) char resolvedName[PATH_MAX]; @@ -294,14 +68,14 @@ class ChrootEnv : public EnvWrapper { char* normalized_path = realpath(res.second.c_str(), nullptr); #endif if (normalized_path == nullptr) { - res.first = Status::NotFound(res.second, strerror(errno)); + res.first = IOStatus::NotFound(res.second, errnoStr(errno).c_str()); } else if (strlen(normalized_path) < chroot_dir_.size() || strncmp(normalized_path, chroot_dir_.c_str(), chroot_dir_.size()) != 0) { - res.first = Status::IOError(res.second, - "Attempted to access path outside chroot"); + res.first = IOStatus::IOError(res.second, + "Attempted to access path outside chroot"); } else { - res.first = Status::OK(); + res.first = IOStatus::OK(); } #if !defined(OS_AIX) free(normalized_path); @@ -311,10 +85,10 @@ class ChrootEnv : public EnvWrapper { // Similar to EncodePath() except assumes the basename in the path hasn't been // created yet. - std::pair EncodePathWithNewBasename( - const std::string& path) { + std::pair EncodePathWithNewBasename( + const std::string& path) override { if (path.empty() || path[0] != '/') { - return {Status::InvalidArgument(path, "Not an absolute path"), ""}; + return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""}; } // Basename may be followed by trailing slashes size_t final_idx = path.find_last_not_of('/'); @@ -331,14 +105,23 @@ class ChrootEnv : public EnvWrapper { return status_and_enc_path; } + private: std::string chroot_dir_; }; +} // namespace + +std::shared_ptr NewChrootFileSystem( + const std::shared_ptr& base, const std::string& chroot_dir) { + return std::make_shared(base, chroot_dir); +} Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir) { if (!base_env->FileExists(chroot_dir).ok()) { return nullptr; } - return new ChrootEnv(base_env, chroot_dir); + std::shared_ptr chroot_fs = + NewChrootFileSystem(base_env->GetFileSystem(), chroot_dir); + return new CompositeEnvWrapper(base_env, chroot_fs); } } // namespace ROCKSDB_NAMESPACE diff --git a/env/env_chroot.h b/env/env_chroot.h index cb5585b3b7d..fb5b70c4480 100644 --- a/env/env_chroot.h +++ b/env/env_chroot.h @@ -15,6 +15,9 @@ namespace ROCKSDB_NAMESPACE { // Returns an Env that translates paths such that the root directory appears to // be chroot_dir. chroot_dir should refer to an existing directory. +// +// This class has not been fully analyzed for providing strong security +// guarantees. Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir); } // namespace ROCKSDB_NAMESPACE diff --git a/env/env_encryption.cc b/env/env_encryption.cc index ca2542abbb1..a5670ad780d 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -12,9 +12,12 @@ #include #include +#include "env/composite_env_wrapper.h" #include "env/env_encryption_ctr.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/convenience.h" +#include "rocksdb/io_status.h" +#include "rocksdb/system_clock.h" #include "util/aligned_buffer.h" #include "util/coding.h" #include "util/random.h" @@ -84,19 +87,24 @@ std::shared_ptr EncryptionProvider::NewCTRProvider( // If an error was encountered, returns a non-OK status. 
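As a usage sketch of the chroot layering introduced here: NewChrootEnv() now builds a ChrootFileSystem over the base Env's FileSystem and wraps it in a CompositeEnvWrapper, so either layer can be used directly. The directory path below is a hypothetical example, not something defined by this patch.

// Sketch only: "/tmp/rocksdb_chroot" must already exist, otherwise
// NewChrootEnv() returns nullptr.
Env* base = Env::Default();
std::unique_ptr<Env> chroot_env(NewChrootEnv(base, "/tmp/rocksdb_chroot"));
if (chroot_env != nullptr) {
  // "/db" is resolved against the chroot directory, i.e. it maps to
  // "/tmp/rocksdb_chroot/db" on the underlying file system.
  Status s = chroot_env->CreateDirIfMissing("/db");
}
// The FileSystem layer can also be used on its own:
std::shared_ptr<FileSystem> chroot_fs =
    NewChrootFileSystem(base->GetFileSystem(), "/tmp/rocksdb_chroot");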
// // REQUIRES: External synchronization -Status EncryptedSequentialFile::Read(size_t n, Slice* result, char* scratch) { +IOStatus EncryptedSequentialFile::Read(size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { assert(scratch); - Status status = file_->Read(n, result, scratch); - if (!status.ok()) { - return status; + IOStatus io_s = file_->Read(n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; } { PERF_TIMER_GUARD(decrypt_data_nanos); - status = stream_->Decrypt(offset_, (char*)result->data(), result->size()); + io_s = status_to_io_status( + stream_->Decrypt(offset_, (char*)result->data(), result->size())); } - offset_ += result->size(); // We've already ready data from disk, so update - // offset_ even if decryption fails. - return status; + if (io_s.ok()) { + offset_ += result->size(); // We've already ready data from disk, so update + // offset_ even if decryption fails. + } + return io_s; } // Skip "n" bytes from the file. This is guaranteed to be no @@ -106,7 +114,7 @@ Status EncryptedSequentialFile::Read(size_t n, Slice* result, char* scratch) { // file, and Skip will return OK. // // REQUIRES: External synchronization -Status EncryptedSequentialFile::Skip(uint64_t n) { +IOStatus EncryptedSequentialFile::Skip(uint64_t n) { auto status = file_->Skip(n); if (!status.ok()) { return status; @@ -130,26 +138,30 @@ size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const { // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. -Status EncryptedSequentialFile::InvalidateCache(size_t offset, size_t length) { +IOStatus EncryptedSequentialFile::InvalidateCache(size_t offset, + size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } // Positioned Read for direct I/O // If Direct I/O enabled, offset, n, and scratch should be properly aligned -Status EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, - Slice* result, char* scratch) { +IOStatus EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) { assert(scratch); offset += prefixLength_; // Skip prefix - auto status = file_->PositionedRead(offset, n, result, scratch); - if (!status.ok()) { - return status; + auto io_s = file_->PositionedRead(offset, n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; } offset_ = offset + result->size(); { PERF_TIMER_GUARD(decrypt_data_nanos); - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); + io_s = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); } - return status; + return io_s; } // Read up to "n" bytes from the file starting at "offset". @@ -162,25 +174,30 @@ Status EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. 
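The status_to_io_status() conversion used throughout these hunks is the general bridge between legacy Status-returning code and the IOStatus-based FileSystem API. A minimal sketch of the pattern, with a hypothetical helper name and call site:

// Sketch only: adapt a legacy Status-returning Env call to an IOStatus-based
// caller, as done repeatedly in the hunks above. `legacy_env` is hypothetical.
IOStatus MakeDirViaLegacyEnv(Env* legacy_env, const std::string& dirname) {
  // status_to_io_status() moves the Status into an IOStatus, keeping the
  // original code and message.
  return status_to_io_status(legacy_env->CreateDir(dirname));
}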
-Status EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { +IOStatus EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const { assert(scratch); offset += prefixLength_; - auto status = file_->Read(offset, n, result, scratch); - if (!status.ok()) { - return status; + auto io_s = file_->Read(offset, n, options, result, scratch, dbg); + if (!io_s.ok()) { + return io_s; } { PERF_TIMER_GUARD(decrypt_data_nanos); - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); + io_s = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); } - return status; + return io_s; } // Readahead the file starting from offset by n bytes for caching. -Status EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n) { +IOStatus EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n, + const IOOptions& options, + IODebugContext* dbg) { // return Status::OK(); - return file_->Prefetch(offset + prefixLength_, n); + return file_->Prefetch(offset + prefixLength_, n, options, dbg); } // Tries to get an unique ID for this file that will be the same each time @@ -221,20 +238,21 @@ size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const { // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. -Status EncryptedRandomAccessFile::InvalidateCache(size_t offset, - size_t length) { +IOStatus EncryptedRandomAccessFile::InvalidateCache(size_t offset, + size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } // A file abstraction for sequential writing. The implementation // must provide buffering since callers may append small fragments // at a time to the file. -Status EncryptedWritableFile::Append(const Slice& data) { +IOStatus EncryptedWritableFile::Append(const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { AlignedBuffer buf; - Status status; Slice dataToAppend(data); if (data.size() > 0) { - auto offset = file_->GetFileSize(); // size including prefix + auto offset = file_->GetFileSize(options, dbg); // size including prefix // Encrypt in cloned buffer buf.Alignment(GetRequiredBufferAlignment()); buf.AllocateNewBuffer(data.size()); @@ -242,26 +260,25 @@ Status EncryptedWritableFile::Append(const Slice& data) { // so that the next two lines can be replaced with buf.Append(). 
memmove(buf.BufferStart(), data.data(), data.size()); buf.Size(data.size()); + IOStatus io_s; { PERF_TIMER_GUARD(encrypt_data_nanos); - status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); } - if (!status.ok()) { - return status; + if (!io_s.ok()) { + return io_s; } dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } - status = file_->Append(dataToAppend); - if (!status.ok()) { - return status; - } - return status; + return file_->Append(dataToAppend, options, dbg); } -Status EncryptedWritableFile::PositionedAppend(const Slice& data, - uint64_t offset) { +IOStatus EncryptedWritableFile::PositionedAppend(const Slice& data, + uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) { AlignedBuffer buf; - Status status; Slice dataToAppend(data); offset += prefixLength_; if (data.size() > 0) { @@ -270,20 +287,18 @@ Status EncryptedWritableFile::PositionedAppend(const Slice& data, buf.AllocateNewBuffer(data.size()); memmove(buf.BufferStart(), data.data(), data.size()); buf.Size(data.size()); + IOStatus io_s; { PERF_TIMER_GUARD(encrypt_data_nanos); - status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); } - if (!status.ok()) { - return status; + if (!io_s.ok()) { + return io_s; } dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); } - status = file_->PositionedAppend(dataToAppend, offset); - if (!status.ok()) { - return status; - } - return status; + return file_->PositionedAppend(dataToAppend, offset, options, dbg); } // Indicates the upper layers if the current WritableFile implementation @@ -301,48 +316,83 @@ size_t EncryptedWritableFile::GetRequiredBufferAlignment() const { /* * Get the size of valid data in the file. */ -uint64_t EncryptedWritableFile::GetFileSize() { - return file_->GetFileSize() - prefixLength_; +uint64_t EncryptedWritableFile::GetFileSize(const IOOptions& options, + IODebugContext* dbg) { + return file_->GetFileSize(options, dbg) - prefixLength_; } - // Truncate is necessary to trim the file to the correct size - // before closing. It is not always possible to keep track of the file - // size due to whole pages writes. The behavior is undefined if called - // with other writes to follow. -Status EncryptedWritableFile::Truncate(uint64_t size) { - return file_->Truncate(size + prefixLength_); +// Truncate is necessary to trim the file to the correct size +// before closing. It is not always possible to keep track of the file +// size due to whole pages writes. The behavior is undefined if called +// with other writes to follow. +IOStatus EncryptedWritableFile::Truncate(uint64_t size, + const IOOptions& options, + IODebugContext* dbg) { + return file_->Truncate(size + prefixLength_, options, dbg); } - // Remove any kind of caching of data from the offset to offset+length - // of this file. If the length is 0, then it refers to the end of file. - // If the system is not caching the file contents, then this is a noop. - // This call has no effect on dirty pages in the cache. -Status EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) { +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. 
+// This call has no effect on dirty pages in the cache. +IOStatus EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } - // Sync a file range with disk. - // offset is the starting byte of the file range to be synchronized. - // nbytes specifies the length of the range to be synchronized. - // This asks the OS to initiate flushing the cached data to disk, - // without waiting for completion. - // Default implementation does nothing. -Status EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) { - return file_->RangeSync(offset + prefixLength_, nbytes); +// Sync a file range with disk. +// offset is the starting byte of the file range to be synchronized. +// nbytes specifies the length of the range to be synchronized. +// This asks the OS to initiate flushing the cached data to disk, +// without waiting for completion. +// Default implementation does nothing. +IOStatus EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& options, + IODebugContext* dbg) { + return file_->RangeSync(offset + prefixLength_, nbytes, options, dbg); +} + +// PrepareWrite performs any necessary preparation for a write +// before the write actually occurs. This allows for pre-allocation +// of space on devices where it can result in less file +// fragmentation and/or less waste from over-zealous filesystem +// pre-allocation. +void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len, + const IOOptions& options, + IODebugContext* dbg) { + file_->PrepareWrite(offset + prefixLength_, len, options, dbg); +} + +void EncryptedWritableFile::SetPreallocationBlockSize(size_t size) { + // the size here doesn't need to include prefixLength_, as it's a + // configuration will be use for `PrepareWrite()`. + file_->SetPreallocationBlockSize(size); } - // PrepareWrite performs any necessary preparation for a write - // before the write actually occurs. This allows for pre-allocation - // of space on devices where it can result in less file - // fragmentation and/or less waste from over-zealous filesystem - // pre-allocation. -void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len) { - file_->PrepareWrite(offset + prefixLength_, len); +void EncryptedWritableFile::GetPreallocationStatus( + size_t* block_size, size_t* last_allocated_block) { + file_->GetPreallocationStatus(block_size, last_allocated_block); } - // Pre-allocates space for a file. -Status EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len) { - return file_->Allocate(offset + prefixLength_, len); +// Pre-allocates space for a file. +IOStatus EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& options, + IODebugContext* dbg) { + return file_->Allocate(offset + prefixLength_, len, options, dbg); +} + +IOStatus EncryptedWritableFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + return file_->Flush(options, dbg); +} + +IOStatus EncryptedWritableFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Sync(options, dbg); +} + +IOStatus EncryptedWritableFile::Close(const IOOptions& options, + IODebugContext* dbg) { + return file_->Close(options, dbg); } // A file abstraction for random reading and writing. @@ -361,9 +411,10 @@ size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const { // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. 
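All of the writable-file overrides above apply the same translation: caller-visible ("logical") offsets and sizes exclude the encryption prefix, while everything forwarded to the underlying file is shifted by prefixLength_. A small illustrative sketch of that invariant; the 4096-byte prefix length is an arbitrary example value, not something fixed by this patch.

// Illustration only: offset/size translation performed by the encrypted file
// wrappers, using an example 4096-byte provider prefix.
constexpr uint64_t kExamplePrefixLength = 4096;

uint64_t LogicalToPhysicalOffset(uint64_t logical_offset) {
  return logical_offset + kExamplePrefixLength;  // e.g. Append/RangeSync/Allocate
}

uint64_t PhysicalToLogicalSize(uint64_t physical_size) {
  return physical_size - kExamplePrefixLength;   // e.g. GetFileSize
}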
-Status EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data) { +IOStatus EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& options, + IODebugContext* dbg) { AlignedBuffer buf; - Status status; Slice dataToWrite(data); offset += prefixLength_; if (data.size() > 0) { @@ -372,71 +423,89 @@ Status EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data) { buf.AllocateNewBuffer(data.size()); memmove(buf.BufferStart(), data.data(), data.size()); buf.Size(data.size()); + IOStatus io_s; { PERF_TIMER_GUARD(encrypt_data_nanos); - status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); + io_s = status_to_io_status( + stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize())); } - if (!status.ok()) { - return status; + if (!io_s.ok()) { + return io_s; } dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize()); } - status = file_->Write(offset, dataToWrite); - return status; + return file_->Write(offset, dataToWrite, options, dbg); } // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. // Returns Status::OK() on success. -Status EncryptedRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { +IOStatus EncryptedRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) const { assert(scratch); offset += prefixLength_; - auto status = file_->Read(offset, n, result, scratch); + auto status = file_->Read(offset, n, options, result, scratch, dbg); if (!status.ok()) { return status; } { PERF_TIMER_GUARD(decrypt_data_nanos); - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); + status = status_to_io_status( + stream_->Decrypt(offset, (char*)result->data(), result->size())); } return status; } -Status EncryptedRandomRWFile::Flush() { return file_->Flush(); } +IOStatus EncryptedRandomRWFile::Flush(const IOOptions& options, + IODebugContext* dbg) { + return file_->Flush(options, dbg); +} -Status EncryptedRandomRWFile::Sync() { return file_->Sync(); } +IOStatus EncryptedRandomRWFile::Sync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Sync(options, dbg); +} -Status EncryptedRandomRWFile::Fsync() { return file_->Fsync(); } +IOStatus EncryptedRandomRWFile::Fsync(const IOOptions& options, + IODebugContext* dbg) { + return file_->Fsync(options, dbg); +} -Status EncryptedRandomRWFile::Close() { return file_->Close(); } +IOStatus EncryptedRandomRWFile::Close(const IOOptions& options, + IODebugContext* dbg) { + return file_->Close(options, dbg); +} -// EncryptedEnv implements an Env wrapper that adds encryption to files stored -// on disk. -class EncryptedEnvImpl : public EnvWrapper { +namespace { +// EncryptedFileSystemImpl implements an FileSystemWrapper that adds encryption +// to files stored on disk. +class EncryptedFileSystemImpl : public EncryptedFileSystem { + public: + const char* Name() const override { return "EncryptedFS"; } // Returns the raw encryption provider that should be used to write the input // encrypted file. If there is no such provider, NotFound is returned. 
- Status GetWritableProvider(const std::string& /*fname*/, - EncryptionProvider** result) { + IOStatus GetWritableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { if (provider_) { *result = provider_.get(); - return Status::OK(); + return IOStatus::OK(); } else { *result = nullptr; - return Status::NotFound("No WriteProvider specified"); + return IOStatus::NotFound("No WriteProvider specified"); } } // Returns the raw encryption provider that should be used to read the input // encrypted file. If there is no such provider, NotFound is returned. - Status GetReadableProvider(const std::string& /*fname*/, - EncryptionProvider** result) { + IOStatus GetReadableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { if (provider_) { *result = provider_.get(); - return Status::OK(); + return IOStatus::OK(); } else { *result = nullptr; - return Status::NotFound("No Provider specified"); + return IOStatus::NotFound("No Provider specified"); } } @@ -452,13 +521,13 @@ class EncryptedEnvImpl : public EnvWrapper { // should be encrypted // @return OK on success, non-OK on failure. template - Status CreateWritableCipherStream( + IOStatus CreateWritableCipherStream( const std::string& fname, const std::unique_ptr& underlying, - const EnvOptions& options, size_t* prefix_length, - std::unique_ptr* stream) { + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { EncryptionProvider* provider = nullptr; *prefix_length = 0; - Status status = GetWritableProvider(fname, &provider); + IOStatus status = GetWritableProvider(fname, &provider); if (!status.ok()) { return status; } else if (provider != nullptr) { @@ -470,34 +539,36 @@ class EncryptedEnvImpl : public EnvWrapper { // Initialize prefix buffer.Alignment(underlying->GetRequiredBufferAlignment()); buffer.AllocateNewBuffer(*prefix_length); - status = provider->CreateNewPrefix(fname, buffer.BufferStart(), - *prefix_length); + status = status_to_io_status(provider->CreateNewPrefix( + fname, buffer.BufferStart(), *prefix_length)); if (status.ok()) { buffer.Size(*prefix_length); prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); // Write prefix - status = underlying->Append(prefix); + status = underlying->Append(prefix, options.io_options, dbg); } if (!status.ok()) { return status; } } // Create cipher stream - status = provider->CreateCipherStream(fname, options, prefix, stream); + status = status_to_io_status( + provider->CreateCipherStream(fname, options, prefix, stream)); } return status; } template - Status CreateWritableEncryptedFile(const std::string& fname, - std::unique_ptr& underlying, - const EnvOptions& options, - std::unique_ptr* result) { + IOStatus CreateWritableEncryptedFile(const std::string& fname, + std::unique_ptr& underlying, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) { // Create cipher stream std::unique_ptr stream; size_t prefix_length; - Status status = CreateWritableCipherStream(fname, underlying, options, - &prefix_length, &stream); + IOStatus status = CreateWritableCipherStream(fname, underlying, options, + &prefix_length, &stream, dbg); if (status.ok()) { if (stream) { result->reset(new EncryptedWritableFile( @@ -521,15 +592,15 @@ class EncryptedEnvImpl : public EnvWrapper { // should be encrypted // @return OK on success, non-OK on failure. 
template - Status CreateRandomWriteCipherStream( + IOStatus CreateRandomWriteCipherStream( const std::string& fname, const std::unique_ptr& underlying, - const EnvOptions& options, size_t* prefix_length, - std::unique_ptr* stream) { + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { EncryptionProvider* provider = nullptr; *prefix_length = 0; - Status status = GetWritableProvider(fname, &provider); - if (!status.ok()) { - return status; + IOStatus io_s = GetWritableProvider(fname, &provider); + if (!io_s.ok()) { + return io_s; } else if (provider != nullptr) { // Initialize & write prefix (if needed) AlignedBuffer buffer; @@ -539,22 +610,23 @@ class EncryptedEnvImpl : public EnvWrapper { // Initialize prefix buffer.Alignment(underlying->GetRequiredBufferAlignment()); buffer.AllocateNewBuffer(*prefix_length); - status = provider->CreateNewPrefix(fname, buffer.BufferStart(), - *prefix_length); - if (status.ok()) { + io_s = status_to_io_status(provider->CreateNewPrefix( + fname, buffer.BufferStart(), *prefix_length)); + if (io_s.ok()) { buffer.Size(*prefix_length); prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); // Write prefix - status = underlying->Write(0, prefix); + io_s = underlying->Write(0, prefix, options.io_options, dbg); } - if (!status.ok()) { - return status; + if (!io_s.ok()) { + return io_s; } } // Create cipher stream - status = provider->CreateCipherStream(fname, options, prefix, stream); + io_s = status_to_io_status( + provider->CreateCipherStream(fname, options, prefix, stream)); } - return status; + return io_s; } // Creates a CipherStream for the underlying file/name using the options @@ -569,10 +641,10 @@ class EncryptedEnvImpl : public EnvWrapper { // is encrypted // @return OK on success, non-OK on failure. template - Status CreateSequentialCipherStream( + IOStatus CreateSequentialCipherStream( const std::string& fname, const std::unique_ptr& underlying, - const EnvOptions& options, size_t* prefix_length, - std::unique_ptr* stream) { + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { // Read prefix (if needed) AlignedBuffer buffer; Slice prefix; @@ -581,14 +653,15 @@ class EncryptedEnvImpl : public EnvWrapper { // Read prefix buffer.Alignment(underlying->GetRequiredBufferAlignment()); buffer.AllocateNewBuffer(*prefix_length); - Status status = - underlying->Read(*prefix_length, &prefix, buffer.BufferStart()); + IOStatus status = underlying->Read(*prefix_length, options.io_options, + &prefix, buffer.BufferStart(), dbg); if (!status.ok()) { return status; } buffer.Size(*prefix_length); } - return provider_->CreateCipherStream(fname, options, prefix, stream); + return status_to_io_status( + provider_->CreateCipherStream(fname, options, prefix, stream)); } // Creates a CipherStream for the underlying file/name using the options @@ -603,10 +676,10 @@ class EncryptedEnvImpl : public EnvWrapper { // is encrypted // @return OK on success, non-OK on failure. 
template - Status CreateRandomReadCipherStream( + IOStatus CreateRandomReadCipherStream( const std::string& fname, const std::unique_ptr& underlying, - const EnvOptions& options, size_t* prefix_length, - std::unique_ptr* stream) { + const FileOptions& options, size_t* prefix_length, + std::unique_ptr* stream, IODebugContext* dbg) { // Read prefix (if needed) AlignedBuffer buffer; Slice prefix; @@ -615,42 +688,60 @@ class EncryptedEnvImpl : public EnvWrapper { // Read prefix buffer.Alignment(underlying->GetRequiredBufferAlignment()); buffer.AllocateNewBuffer(*prefix_length); - Status status = - underlying->Read(0, *prefix_length, &prefix, buffer.BufferStart()); + IOStatus status = underlying->Read(0, *prefix_length, options.io_options, + &prefix, buffer.BufferStart(), dbg); if (!status.ok()) { return status; } buffer.Size(*prefix_length); } - return provider_->CreateCipherStream(fname, options, prefix, stream); + return status_to_io_status( + provider_->CreateCipherStream(fname, options, prefix, stream)); } public: - EncryptedEnvImpl(Env* base_env, - const std::shared_ptr& provider) - : EnvWrapper(base_env) { + EncryptedFileSystemImpl(const std::shared_ptr& base, + const std::shared_ptr& provider) + : EncryptedFileSystem(base) { provider_ = provider; } + Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) override { + return provider_->AddCipher(descriptor, cipher, len, for_write); + } + // NewSequentialFile opens a file for sequential reading. - virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_reads) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - auto status = EnvWrapper::NewSequentialFile(fname, &underlying, options); + std::unique_ptr underlying; + auto status = + FileSystemWrapper::NewSequentialFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } + uint64_t file_size; + status = FileSystemWrapper::GetFileSize(fname, options.io_options, + &file_size, dbg); + if (!status.ok()) { + return status; + } + if (!file_size) { + *result = std::move(underlying); + return status; + } // Create cipher stream std::unique_ptr stream; size_t prefix_length; status = CreateSequentialCipherStream(fname, underlying, options, - &prefix_length, &stream); + &prefix_length, &stream, dbg); if (status.ok()) { result->reset(new EncryptedSequentialFile( std::move(underlying), std::move(stream), prefix_length)); @@ -659,23 +750,25 @@ class EncryptedEnvImpl : public EnvWrapper { } // NewRandomAccessFile opens a file for random read access. 
- virtual Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_reads) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - auto status = EnvWrapper::NewRandomAccessFile(fname, &underlying, options); + std::unique_ptr underlying; + auto status = FileSystemWrapper::NewRandomAccessFile(fname, options, + &underlying, dbg); if (!status.ok()) { return status; } std::unique_ptr stream; size_t prefix_length; status = CreateRandomReadCipherStream(fname, underlying, options, - &prefix_length, &stream); + &prefix_length, &stream, dbg); if (status.ok()) { if (stream) { result->reset(new EncryptedRandomAccessFile( @@ -688,20 +781,21 @@ class EncryptedEnvImpl : public EnvWrapper { } // NewWritableFile opens a file for sequential writing. - virtual Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::NewWritableFile(fname, &underlying, options); + std::unique_ptr underlying; + IOStatus status = + FileSystemWrapper::NewWritableFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - return CreateWritableEncryptedFile(fname, underlying, options, result); + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } // Create an object that writes to a new file with the specified @@ -711,39 +805,42 @@ class EncryptedEnvImpl : public EnvWrapper { // returns non-OK. // // The returned file will only be accessed by one thread at a time. - virtual Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::ReopenWritableFile(fname, &underlying, options); + std::unique_ptr underlying; + IOStatus status = + FileSystemWrapper::ReopenWritableFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } - return CreateWritableEncryptedFile(fname, underlying, options, result); + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } // Reuse an existing file by renaming it and opening it as writable. 
- virtual Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = - EnvWrapper::ReuseWritableFile(fname, old_fname, &underlying, options); + std::unique_ptr underlying; + auto status = FileSystemWrapper::ReuseWritableFile( + fname, old_fname, options, &underlying, dbg); if (!status.ok()) { return status; } - return CreateWritableEncryptedFile(fname, underlying, options, result); + return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } // Open `fname` for random read and write, if file doesn't exist the file @@ -751,19 +848,20 @@ class EncryptedEnvImpl : public EnvWrapper { // *result and returns OK. On failure returns non-OK. // // The returned file will only be accessed by one thread at a time. - virtual Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (options.use_mmap_reads || options.use_mmap_writes) { - return Status::InvalidArgument(); + return IOStatus::InvalidArgument(); } // Check file exists - bool isNewFile = !FileExists(fname).ok(); + bool isNewFile = !FileExists(fname, options.io_options, dbg).ok(); // Open file using underlying Env implementation - std::unique_ptr underlying; - Status status = EnvWrapper::NewRandomRWFile(fname, &underlying, options); + std::unique_ptr underlying; + auto status = + FileSystemWrapper::NewRandomRWFile(fname, options, &underlying, dbg); if (!status.ok()) { return status; } @@ -773,10 +871,10 @@ class EncryptedEnvImpl : public EnvWrapper { if (!isNewFile) { // File already exists, read prefix status = CreateRandomReadCipherStream(fname, underlying, options, - &prefix_length, &stream); + &prefix_length, &stream, dbg); } else { status = CreateRandomWriteCipherStream(fname, underlying, options, - &prefix_length, &stream); + &prefix_length, &stream, dbg); } if (status.ok()) { if (stream) { @@ -803,9 +901,12 @@ class EncryptedEnvImpl : public EnvWrapper { // have // permission to access "dir", or if "dir" is invalid. // IOError if an IO Error was encountered - virtual Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override { - auto status = EnvWrapper::GetChildrenFileAttributes(dir, result); + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override { + auto status = + FileSystemWrapper::GetChildrenFileAttributes(dir, options, result, dbg); if (!status.ok()) { return status; } @@ -823,14 +924,15 @@ class EncryptedEnvImpl : public EnvWrapper { it->size_bytes -= provider->GetPrefixLength(); } } - return Status::OK(); + return IOStatus::OK(); } // Store the size of fname in *file_size. 
- virtual Status GetFileSize(const std::string& fname, - uint64_t* file_size) override { - auto status = EnvWrapper::GetFileSize(fname, file_size); - if (!status.ok()) { + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override { + auto status = + FileSystemWrapper::GetFileSize(fname, options, file_size, dbg); + if (!status.ok() || !(*file_size)) { return status; } EncryptionProvider* provider; @@ -846,12 +948,19 @@ class EncryptedEnvImpl : public EnvWrapper { private: std::shared_ptr provider_; }; +} // namespace +std::shared_ptr NewEncryptedFS( + const std::shared_ptr& base, + const std::shared_ptr& provider) { + return std::make_shared(base, provider); +} // Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. Env* NewEncryptedEnv(Env* base_env, const std::shared_ptr& provider) { - return new EncryptedEnvImpl(base_env, provider); + return new CompositeEnvWrapper( + base_env, NewEncryptedFS(base_env->GetFileSystem(), provider)); } // Encrypt one or more (partial) blocks of data at the file offset. @@ -1054,7 +1163,7 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, return Status::InvalidArgument("Encryption Cipher is missing"); } // Create & seed rnd. - Random rnd((uint32_t)Env::Default()->NowMicros()); + Random rnd((uint32_t)SystemClock::Default()->NowMicros()); // Fill entire prefix block with random values. for (size_t i = 0; i < prefixLength; i++) { prefix[i] = rnd.Uniform(256) & 0xFF; diff --git a/env/env_hdfs.cc b/env/env_hdfs.cc index 3323eeb8af3..e0443dd944d 100644 --- a/env/env_hdfs.cc +++ b/env/env_hdfs.cc @@ -37,10 +37,10 @@ namespace { // Log error message static Status IOError(const std::string& context, int err_number) { return (err_number == ENOSPC) - ? Status::NoSpace(context, strerror(err_number)) + ? Status::NoSpace(context, errnoStr(err_number).c_str()) : (err_number == ENOENT) - ? Status::PathNotFound(context, strerror(err_number)) - : Status::IOError(context, strerror(err_number)); + ? Status::PathNotFound(context, errnoStr(err_number).c_str()) + : Status::IOError(context, errnoStr(err_number).c_str()); } // assume that there is one global logger for now. It is not thread-safe, @@ -213,6 +213,8 @@ class HdfsWritableFile: public WritableFile { } } + using WritableFile::Append; + // If the file was successfully created, then this returns true. // Otherwise returns false. 
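A usage sketch for the new entry points: the provider id string below is illustrative, and a real provider must be configured with a cipher before encrypted files can be created.

// Sketch only: build an encryption provider, then either an encrypted Env or
// an encrypted FileSystem.
std::shared_ptr<EncryptionProvider> provider;
Status s =
    EncryptionProvider::CreateFromString(ConfigOptions(), "CTR", &provider);
if (s.ok()) {
  // Env-level wrapper, for legacy call sites:
  std::unique_ptr<Env> encrypted_env(
      NewEncryptedEnv(Env::Default(), provider));
  // FileSystem-level wrapper, for the new IOStatus-based call sites:
  std::shared_ptr<FileSystem> encrypted_fs =
      NewEncryptedFS(FileSystem::Default(), provider);
}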
bool isValid() { diff --git a/env/env_posix.cc b/env/env_posix.cc index fd7cbec22d3..fdcb6f6a31a 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -16,9 +16,6 @@ #include #include -#if defined(OS_LINUX) -#include -#endif #if defined(ROCKSDB_IOURING_PRESENT) #include #endif @@ -27,13 +24,10 @@ #include #include #include -#include #include #include #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include -#include -#include #endif #include #include @@ -58,13 +52,14 @@ #include "env/composite_env_wrapper.h" #include "env/io_posix.h" -#include "logging/logging.h" #include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" #include "port/port.h" +#include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/compression_context_cache.h" @@ -129,14 +124,86 @@ class PosixDynamicLibrary : public DynamicLibrary { }; #endif // !ROCKSDB_NO_DYNAMIC_EXTENSION -class PosixEnv : public CompositeEnvWrapper { +class PosixClock : public SystemClock { public: - // This constructor is for constructing non-default Envs, mainly by - // NewCompositeEnv(). It allows new instances to share the same - // threadpool and other resources as the default Env, while allowing - // a non-default FileSystem implementation - PosixEnv(const PosixEnv* default_env, std::shared_ptr fs); + const char* Name() const override { return "PosixClock"; } + uint64_t NowMicros() override { + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + uint64_t NowNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#elif defined(OS_SOLARIS) + return gethrtime(); +#elif defined(__MACH__) + clock_serv_t cclock; + mach_timespec_t ts; + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); + clock_get_time(cclock, &ts); + mach_port_deallocate(mach_task_self(), cclock); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#else + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +#endif + } + uint64_t CPUMicros() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return static_cast(ts.tv_sec) * 1000000000; +#endif + return 0; + } + + uint64_t CPUNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#endif + return 0; + } + + void SleepForMicroseconds(int micros) override { usleep(micros); } + + Status GetCurrentTime(int64_t* unix_time) override { + time_t ret = time(nullptr); + if (ret == (time_t)-1) { + return IOError("GetCurrentTime", "", errno); + } + *unix_time = (int64_t)ret; + return Status::OK(); + } + + std::string TimeToString(uint64_t secondsSince1970) override { + const time_t seconds = (time_t)secondsSince1970; + struct tm t; + int maxsize = 64; + std::string dummy; + dummy.reserve(maxsize); + 
dummy.resize(maxsize); + char* p = &dummy[0]; + localtime_r(&seconds, &t); + snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); + return dummy; + } +}; + +class PosixEnv : public CompositeEnv { + public: + PosixEnv(const PosixEnv* default_env, const std::shared_ptr& fs); ~PosixEnv() override { if (this == Env::Default()) { for (const auto tid : threads_to_join_) { @@ -244,50 +311,11 @@ class PosixEnv : public CompositeEnvWrapper { uint64_t GetThreadID() const override { return gettid(pthread_self()); } - uint64_t NowMicros() override { - struct timeval tv; - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; - } - - uint64_t NowNanos() override { -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ - defined(OS_AIX) - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#elif defined(OS_SOLARIS) - return gethrtime(); -#elif defined(__MACH__) - clock_serv_t cclock; - mach_timespec_t ts; - host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); - clock_get_time(cclock, &ts); - mach_port_deallocate(mach_task_self(), cclock); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#else - return std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()).count(); -#endif - } - - uint64_t NowCPUNanos() override { -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ - defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) - struct timespec ts; - clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#endif - return 0; - } - - void SleepForMicroseconds(int micros) override { usleep(micros); } - Status GetHostName(char* name, uint64_t len) override { int ret = gethostname(name, static_cast(len)); if (ret < 0) { if (errno == EFAULT || errno == EINVAL) { - return Status::InvalidArgument(strerror(errno)); + return Status::InvalidArgument(errnoStr(errno).c_str()); } else { return IOError("GetHostName", name, errno); } @@ -295,15 +323,6 @@ class PosixEnv : public CompositeEnvWrapper { return Status::OK(); } - Status GetCurrentTime(int64_t* unix_time) override { - time_t ret = time(nullptr); - if (ret == (time_t) -1) { - return IOError("GetCurrentTime", "", errno); - } - *unix_time = (int64_t) ret; - return Status::OK(); - } - ThreadStatusUpdater* GetThreadStatusUpdater() const override { return Env::GetThreadStatusUpdater(); } @@ -352,26 +371,6 @@ class PosixEnv : public CompositeEnvWrapper { return Status::OK(); } - std::string TimeToString(uint64_t secondsSince1970) override { - const time_t seconds = (time_t)secondsSince1970; - struct tm t; - int maxsize = 64; - std::string dummy; - dummy.reserve(maxsize); - dummy.resize(maxsize); - char* p = &dummy[0]; - localtime_r(&seconds, &t); - snprintf(p, maxsize, - "%04d/%02d/%02d-%02d:%02d:%02d ", - t.tm_year + 1900, - t.tm_mon + 1, - t.tm_mday, - t.tm_hour, - t.tm_min, - t.tm_sec); - return dummy; - } - private: friend Env* Env::Default(); // Constructs the default Env, a singleton @@ -394,7 +393,7 @@ class PosixEnv : public CompositeEnvWrapper { }; PosixEnv::PosixEnv() - : CompositeEnvWrapper(this, FileSystem::Default()), + : CompositeEnv(FileSystem::Default(), SystemClock::Default()), thread_pools_storage_(Priority::TOTAL), allow_non_owner_access_storage_(true), thread_pools_(thread_pools_storage_), @@ -411,12 +410,13 @@ 
PosixEnv::PosixEnv() thread_status_updater_ = CreateThreadStatusUpdater(); } -PosixEnv::PosixEnv(const PosixEnv* default_env, std::shared_ptr fs) - : CompositeEnvWrapper(this, fs), - thread_pools_(default_env->thread_pools_), - mu_(default_env->mu_), - threads_to_join_(default_env->threads_to_join_), - allow_non_owner_access_(default_env->allow_non_owner_access_) { +PosixEnv::PosixEnv(const PosixEnv* default_env, + const std::shared_ptr& fs) + : CompositeEnv(fs, default_env->GetSystemClock()), + thread_pools_(default_env->thread_pools_), + mu_(default_env->mu_), + threads_to_join_(default_env->threads_to_join_), + allow_non_owner_access_(default_env->allow_non_owner_access_) { thread_status_updater_ = default_env->thread_status_updater_; } @@ -470,11 +470,12 @@ void PosixEnv::WaitForJoin() { std::string Env::GenerateUniqueId() { std::string uuid_file = "/proc/sys/kernel/random/uuid"; + std::shared_ptr fs = FileSystem::Default(); - Status s = FileExists(uuid_file); + Status s = fs->FileExists(uuid_file, IOOptions(), nullptr); if (s.ok()) { std::string uuid; - s = ReadFileToString(this, uuid_file, &uuid); + s = ReadFileToString(fs.get(), uuid_file, &uuid); if (s.ok()) { return uuid; } @@ -514,11 +515,19 @@ Env* Env::Default() { return &default_env; } -std::unique_ptr NewCompositeEnv(std::shared_ptr fs) { +std::unique_ptr NewCompositeEnv(const std::shared_ptr& fs) { PosixEnv* default_env = static_cast(Env::Default()); return std::unique_ptr(new PosixEnv(default_env, fs)); } +// +// Default Posix SystemClock +// +const std::shared_ptr& SystemClock::Default() { + static std::shared_ptr default_clock = + std::make_shared(); + return default_clock; +} } // namespace ROCKSDB_NAMESPACE #endif diff --git a/env/env_test.cc b/env/env_test.cc index 660f210e46c..c7239047317 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -11,6 +11,11 @@ #include #endif +#if defined(ROCKSDB_IOURING_PRESENT) +#include +#include +#endif + #include #include @@ -35,10 +40,12 @@ #include "port/malloc.h" #include "port/port.h" #include "rocksdb/env.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" +#include "util/crc32c.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/string_util.h" @@ -89,6 +96,11 @@ class EnvPosixTest : public testing::Test { Env* env_; bool direct_io_; EnvPosixTest() : env_(Env::Default()), direct_io_(false) {} + ~EnvPosixTest() { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } }; class EnvPosixTestWithParam @@ -913,7 +925,7 @@ class IoctlFriendlyTmpdir { } else { // mkdtemp failed: diagnose it, but don't give up. fprintf(stderr, "mkdtemp(%s/...) failed: %s\n", d.c_str(), - strerror(errno)); + errnoStr(errno).c_str()); } } @@ -1038,7 +1050,8 @@ TEST_P(EnvPosixTestWithParam, AllocateTest) { int err_number = 0; if (alloc_status != 0) { err_number = errno; - fprintf(stderr, "Warning: fallocate() fails, %s\n", strerror(err_number)); + fprintf(stderr, "Warning: fallocate() fails, %s\n", + errnoStr(err_number).c_str()); } close(fd); ASSERT_OK(env_->DeleteFile(fname_test_fallocate)); @@ -1265,7 +1278,7 @@ TEST_P(EnvPosixTestWithParam, MultiRead) { } TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) { - // In this test we don't do aligned read, wo it doesn't work for + // In this test we don't do aligned read, so it doesn't work for // direct I/O case. 
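Since timing, sleeping, and time formatting now live on SystemClock rather than Env, the following small hypothetical helper (not part of the patch) shows the replacement call sites; every method used here is one that PosixClock implements above.

```cpp
// Hypothetical helper (not in the patch): clock queries go through
// SystemClock::Default() instead of Env::Default().
#include <cinttypes>
#include <cstdio>
#include <memory>

#include "rocksdb/system_clock.h"

using namespace ROCKSDB_NAMESPACE;

void TimeSomething() {
  const std::shared_ptr<SystemClock>& clock = SystemClock::Default();
  uint64_t start = clock->NowNanos();
  clock->SleepForMicroseconds(1000);  // stands in for the work being measured
  uint64_t elapsed = clock->NowNanos() - start;

  int64_t unix_time = 0;
  clock->GetCurrentTime(&unix_time).PermitUncheckedError();
  std::printf("%s elapsed: %" PRIu64 " ns\n",
              clock->TimeToString(static_cast<uint64_t>(unix_time)).c_str(),
              elapsed);
}
```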
EnvOptions soptions; soptions.use_direct_reads = soptions.use_direct_writes = false; @@ -1356,6 +1369,121 @@ TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) { } } +#if defined(ROCKSDB_IOURING_PRESENT) +void GenerateFilesAndRequest(Env* env, const std::string& fname, + std::vector* ret_reqs, + std::vector* scratches) { + const size_t kTotalSize = 81920; + Random rnd(301); + std::string expected_data = rnd.RandomString(kTotalSize); + + // Create file. + { + std::unique_ptr wfile; + ASSERT_OK(env->NewWritableFile(fname, &wfile, EnvOptions())); + ASSERT_OK(wfile->Append(expected_data)); + ASSERT_OK(wfile->Close()); + } + + // Right now kIoUringDepth is hard coded as 256, so we need very large + // number of keys to cover the case of multiple rounds of submissions. + // Right now the test latency is still acceptable. If it ends up with + // too long, we can modify the io uring depth with SyncPoint here. + const int num_reads = 3; + std::vector offsets = {10000, 20000, 30000}; + std::vector lens = {3000, 200, 100}; + + // Create requests + scratches->reserve(num_reads); + std::vector& reqs = *ret_reqs; + reqs.resize(num_reads); + for (int i = 0; i < num_reads; ++i) { + reqs[i].offset = offsets[i]; + reqs[i].len = lens[i]; + scratches->emplace_back(reqs[i].len, ' '); + reqs[i].scratch = const_cast(scratches->back().data()); + } +} + +TEST_F(EnvPosixTest, MultiReadIOUringError) { + // In this test we don't do aligned read, so we can't do direct I/O. + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + std::vector scratches; + std::vector reqs; + GenerateFilesAndRequest(env_, fname, &reqs, &scratches); + // Query the data + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + + bool io_uring_wait_cqe_called = false; + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", + [&](void* arg) { + if (!io_uring_wait_cqe_called) { + io_uring_wait_cqe_called = true; + ssize_t& ret = *(static_cast(arg)); + ret = 1; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = file->MultiRead(reqs.data(), reqs.size()); + if (io_uring_wait_cqe_called) { + ASSERT_NOK(s); + } else { + s.PermitUncheckedError(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(EnvPosixTest, MultiReadIOUringError2) { + // In this test we don't do aligned read, so we can't do direct I/O. 
+ EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + std::vector scratches; + std::vector reqs; + GenerateFilesAndRequest(env_, fname, &reqs, &scratches); + // Query the data + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + + bool io_uring_submit_and_wait_called = false; + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1", + [&](void* arg) { + io_uring_submit_and_wait_called = true; + ssize_t* ret = static_cast(arg); + (*ret)--; + }); + SyncPoint::GetInstance()->SetCallBack( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", + [&](void* arg) { + struct io_uring* iu = static_cast(arg); + struct io_uring_cqe* cqe; + assert(io_uring_wait_cqe(iu, &cqe) == 0); + io_uring_cqe_seen(iu, cqe); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = file->MultiRead(reqs.data(), reqs.size()); + if (io_uring_submit_and_wait_called) { + ASSERT_NOK(s); + } else { + s.PermitUncheckedError(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // ROCKSDB_IOURING_PRESENT + // Only works in linux platforms #ifdef OS_WIN TEST_P(EnvPosixTestWithParam, DISABLED_InvalidateCache) { @@ -1667,12 +1795,26 @@ TEST_P(EnvPosixTestWithParam, WritableFileWrapper) { return Status::OK(); } + Status Append( + const Slice& /*data*/, + const DataVerificationInfo& /* verification_info */) override { + inc(1); + return Status::OK(); + } + Status PositionedAppend(const Slice& /*data*/, uint64_t /*offset*/) override { inc(2); return Status::OK(); } + Status PositionedAppend( + const Slice& /*data*/, uint64_t /*offset*/, + const DataVerificationInfo& /* verification_info */) override { + inc(2); + return Status::OK(); + } + Status Truncate(uint64_t /*size*/) override { inc(3); return Status::OK(); @@ -2051,6 +2193,26 @@ TEST_F(EnvTest, Close) { delete env; } +class LogvWithInfoLogLevelLogger : public Logger { + public: + using Logger::Logv; + void Logv(const InfoLogLevel /* log_level */, const char* /* format */, + va_list /* ap */) override {} +}; + +TEST_F(EnvTest, LogvWithInfoLogLevel) { + // Verifies the log functions work on a `Logger` that only overrides the + // `Logv()` overload including `InfoLogLevel`. 
+ const std::string kSampleMessage("sample log message"); + LogvWithInfoLogLevelLogger logger; + ROCKS_LOG_HEADER(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_DEBUG(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_INFO(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_WARN(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_ERROR(&logger, "%s", kSampleMessage.c_str()); + ROCKS_LOG_FATAL(&logger, "%s", kSampleMessage.c_str()); +} + INSTANTIATE_TEST_CASE_P(DefaultEnvWithoutDirectIO, EnvPosixTestWithParam, ::testing::Values(std::pair(Env::Default(), false))); @@ -2146,7 +2308,7 @@ TEST_P(EnvFSTestWithParam, OptionsTest) { ASSERT_OK(db->Close()); delete db; - DestroyDB(dbname, opts); + ASSERT_OK(DestroyDB(dbname, opts)); dbname = dbname2_; } @@ -2193,7 +2355,8 @@ TEST_F(EnvTest, IsDirectory) { ASSERT_OK(s); std::unique_ptr fwriter; fwriter.reset(new WritableFileWriter(std::move(wfile), test_file_path, - FileOptions(), Env::Default())); + FileOptions(), + SystemClock::Default().get())); constexpr char buf[] = "test"; s = fwriter->Append(buf); ASSERT_OK(s); @@ -2202,6 +2365,28 @@ TEST_F(EnvTest, IsDirectory) { ASSERT_FALSE(is_dir); } +TEST_F(EnvTest, EnvWriteVerificationTest) { + Status s = Env::Default()->CreateDirIfMissing(test_directory_); + const std::string test_file_path = test_directory_ + "file1"; + ASSERT_OK(s); + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + std::unique_ptr file; + s = fault_fs_env->NewWritableFile(test_file_path, &file, EnvOptions()); + ASSERT_OK(s); + + DataVerificationInfo v_info; + std::string test_data = "test"; + std::string checksum; + uint32_t v_crc32c = crc32c::Extend(0, test_data.c_str(), test_data.size()); + PutFixed32(&checksum, v_crc32c); + v_info.checksum = Slice(checksum); + s = file->Append(Slice(test_data), v_info); + ASSERT_OK(s); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/env/file_system.cc b/env/file_system.cc index d2fc06c6c95..a6a2f3388a6 100644 --- a/env/file_system.cc +++ b/env/file_system.cc @@ -3,9 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
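The LogvWithInfoLogLevel test above verifies that a Logger may override only the InfoLogLevel-aware Logv() overload. A minimal hypothetical Logger of that shape (not part of the patch), writing to stderr, might look like this:

```cpp
// Hypothetical Logger overriding only the InfoLogLevel-aware Logv() overload,
// the case the new test exercises. Not part of the patch.
#include <cstdarg>
#include <cstdio>

#include "rocksdb/env.h"

using namespace ROCKSDB_NAMESPACE;

class StderrLogger : public Logger {
 public:
  using Logger::Logv;  // keep the (format, ap) overload visible
  void Logv(const InfoLogLevel log_level, const char* format,
            va_list ap) override {
    std::fprintf(stderr, "[%d] ", static_cast<int>(log_level));
    std::vfprintf(stderr, format, ap);
    std::fprintf(stderr, "\n");
  }
};
```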
// -#include "env/composite_env_wrapper.h" #include "rocksdb/file_system.h" + +#include "env/composite_env_wrapper.h" #include "options/db_options.h" +#include "rocksdb/convenience.h" #include "rocksdb/utilities/object_registry.h" namespace ROCKSDB_NAMESPACE { @@ -16,10 +18,18 @@ FileSystem::~FileSystem() {} Status FileSystem::Load(const std::string& value, std::shared_ptr* result) { + return CreateFromString(ConfigOptions(), value, result); +} + +Status FileSystem::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result) { Status s; #ifndef ROCKSDB_LITE + (void)config_options; s = ObjectRegistry::NewInstance()->NewSharedObject(value, result); #else + (void)config_options; (void)result; s = Status::NotSupported("Cannot load FileSystem in LITE mode", value); #endif @@ -83,6 +93,14 @@ FileOptions FileSystem::OptimizeForCompactionTableRead( return optimized_file_options; } +FileOptions FileSystem::OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const { + FileOptions optimized_file_options(file_options); + optimized_file_options.use_direct_reads = db_options.use_direct_reads; + return optimized_file_options; +} + IOStatus WriteStringToFile(FileSystem* fs, const Slice& data, const std::string& fname, bool should_sync) { std::unique_ptr file; @@ -129,13 +147,4 @@ IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, return s; } -#ifdef OS_WIN -std::shared_ptr FileSystem::Default() { - static LegacyFileSystemWrapper default_fs(Env::Default()); - static std::shared_ptr default_fs_ptr( - &default_fs, [](LegacyFileSystemWrapper*) {}); - return default_fs_ptr; -} -#endif - } // namespace ROCKSDB_NAMESPACE diff --git a/env/file_system_tracer.cc b/env/file_system_tracer.cc index ad15df822e7..9a85dd5e080 100644 --- a/env/file_system_tracer.cc +++ b/env/file_system_tracer.cc @@ -5,33 +5,108 @@ #include "env/file_system_tracer.h" -#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { +IOStatus FileSystemTracingWrapper::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewSequentialFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + IOStatus FileSystemTracingWrapper::NewWritableFile( const std::string& fname, const FileOptions& file_opts, std::unique_ptr* result, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg); uint64_t elapsed = 
timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), fname); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::ReopenWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->ReopenWritableFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& file_opts, std::unique_ptr* result, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = + target()->ReuseWritableFile(fname, old_fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FileSystemTracingWrapper::NewRandomRWFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->NewRandomRWFile(fname, file_opts, result, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FileSystemTracingWrapper::NewDirectory( const std::string& name, const IOOptions& io_opts, std::unique_ptr* result, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->NewDirectory(name, io_opts, result, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), name); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + name.substr(name.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } @@ -39,64 +114,69 @@ IOStatus FileSystemTracingWrapper::GetChildren(const std::string& dir, const IOOptions& io_opts, std::vector* r, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->GetChildren(dir, io_opts, r, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), dir); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dir.substr(dir.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus 
FileSystemTracingWrapper::DeleteFile(const std::string& fname, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->DeleteFile(fname, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), fname); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FileSystemTracingWrapper::CreateDir(const std::string& dirname, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->CreateDir(dirname, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), dirname); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FileSystemTracingWrapper::CreateDirIfMissing( const std::string& dirname, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->CreateDirIfMissing(dirname, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), dirname); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FileSystemTracingWrapper::DeleteDir(const std::string& dirname, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->DeleteDir(dirname, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileName, __func__, - elapsed, s.ToString(), dirname); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + dirname.substr(dirname.find_last_of("/\\") + 1)); + io_tracer_->WriteIOOp(io_record, dbg); return s; } @@ -104,13 +184,33 @@ IOStatus FileSystemTracingWrapper::GetFileSize(const std::string& fname, const IOOptions& options, uint64_t* file_size, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->GetFileSize(fname, options, file_size, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileNameAndFileSize, - __func__, elapsed, s.ToString(), fname, *file_size); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record( + clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, elapsed, + s.ToString(), fname.substr(fname.find_last_of("/\\") + 1), *file_size); + io_tracer_->WriteIOOp(io_record, dbg); + return s; 
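To make the new io_op_data encoding concrete, here is a hypothetical consumer-side snippet (not part of the patch) that tests the same bits these wrappers set. The IOTraceRecord field names are assumptions based on trace_replay/io_tracer.h and may differ slightly.

```cpp
// Hypothetical trace-record dump; field names are assumed, not confirmed.
#include <cinttypes>
#include <cstdio>

#include "trace_replay/io_tracer.h"

using namespace ROCKSDB_NAMESPACE;

void DumpRecord(const IOTraceRecord& record) {
  if (record.io_op_data & (1 << IOTraceOp::kIOFileSize)) {
    std::printf("file_size=%" PRIu64 "\n", record.file_size);
  }
  if (record.io_op_data & (1 << IOTraceOp::kIOLen)) {
    std::printf("len=%zu\n", record.len);
  }
  if (record.io_op_data & (1 << IOTraceOp::kIOOffset)) {
    std::printf("offset=%" PRIu64 "\n", record.offset);
  }
}
```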
+} + +IOStatus FileSystemTracingWrapper::Truncate(const std::string& fname, + size_t size, + const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Truncate(fname, size, options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), + fname.substr(fname.find_last_of("/\\") + 1), size); + io_tracer_->WriteIOOp(io_record, dbg); return s; } @@ -118,40 +218,50 @@ IOStatus FSSequentialFileTracingWrapper::Read(size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Read(n, options, result, scratch, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLen, __func__, - elapsed, s.ToString(), result->size()); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + result->size(), 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FSSequentialFileTracingWrapper::InvalidateCache(size_t offset, size_t length) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->InvalidateCache(offset, length); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), length, offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, + offset); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); return s; } IOStatus FSSequentialFileTracingWrapper::PositionedRead( uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->PositionedRead(offset, n, options, result, scratch, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), result->size(), - offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + result->size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); return s; } @@ -159,13 +269,17 @@ IOStatus FSRandomAccessFileTracingWrapper::Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) const { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Read(offset, n, options, result, scratch, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), n, offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + 
io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); return s; } @@ -173,16 +287,19 @@ IOStatus FSRandomAccessFileTracingWrapper::MultiRead(FSReadRequest* reqs, size_t num_reqs, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->MultiRead(reqs, num_reqs, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); uint64_t latency = elapsed; + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); for (size_t i = 0; i < num_reqs; i++) { - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, latency, reqs[i].status.ToString(), - reqs[i].len, reqs[i].offset); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record( + clock_->NowNanos(), TraceType::kIOTracer, io_op_data, __func__, latency, + reqs[i].status.ToString(), file_name_, reqs[i].len, reqs[i].offset); + io_tracer_->WriteIOOp(io_record, dbg); } return s; } @@ -190,116 +307,142 @@ IOStatus FSRandomAccessFileTracingWrapper::MultiRead(FSReadRequest* reqs, IOStatus FSRandomAccessFileTracingWrapper::Prefetch(uint64_t offset, size_t n, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Prefetch(offset, n, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), n, offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FSRandomAccessFileTracingWrapper::InvalidateCache(size_t offset, size_t length) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->InvalidateCache(offset, length); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), length, + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, static_cast(offset)); - io_tracer_->WriteIOOp(io_record); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); return s; } IOStatus FSWritableFileTracingWrapper::Append(const Slice& data, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Append(data, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLen, __func__, - elapsed, s.ToString(), data.size()); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), 0 /*Offset*/); + 
io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FSWritableFileTracingWrapper::PositionedAppend( const Slice& data, uint64_t offset, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->PositionedAppend(data, offset, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), data.size(), offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FSWritableFileTracingWrapper::Truncate(uint64_t size, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Truncate(size, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLen, __func__, - elapsed, s.ToString(), size); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, size, + 0 /*Offset*/); + io_tracer_->WriteIOOp(io_record, dbg); return s; } IOStatus FSWritableFileTracingWrapper::Close(const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Close(options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOGeneral, __func__, - elapsed, s.ToString()); - io_tracer_->WriteIOOp(io_record); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); return s; } uint64_t FSWritableFileTracingWrapper::GetFileSize(const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); uint64_t file_size = target()->GetFileSize(options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOFileNameAndFileSize, - "GetFileSize", elapsed, "OK", "" /* file_name */, - file_size); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOFileSize); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, "OK", file_name_, file_size); + io_tracer_->WriteIOOp(io_record, dbg); return file_size; } IOStatus FSWritableFileTracingWrapper::InvalidateCache(size_t offset, size_t length) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->InvalidateCache(offset, length); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), length, + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, length, static_cast(offset)); - 
io_tracer_->WriteIOOp(io_record); + io_tracer_->WriteIOOp(io_record, nullptr /*dbg*/); return s; } IOStatus FSRandomRWFileTracingWrapper::Write(uint64_t offset, const Slice& data, const IOOptions& options, IODebugContext* dbg) { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Write(offset, data, options, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), data.size(), offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, + data.size(), offset); + io_tracer_->WriteIOOp(io_record, dbg); return s; } @@ -307,13 +450,69 @@ IOStatus FSRandomRWFileTracingWrapper::Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) const { - StopWatchNano timer(Env::Default()); + StopWatchNano timer(clock_); timer.Start(); IOStatus s = target()->Read(offset, n, options, result, scratch, dbg); uint64_t elapsed = timer.ElapsedNanos(); - IOTraceRecord io_record(env_->NowNanos(), TraceType::kIOLenAndOffset, - __func__, elapsed, s.ToString(), n, offset); - io_tracer_->WriteIOOp(io_record); + uint64_t io_op_data = 0; + io_op_data |= (1 << IOTraceOp::kIOLen); + io_op_data |= (1 << IOTraceOp::kIOOffset); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, io_op_data, + __func__, elapsed, s.ToString(), file_name_, n, + offset); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Flush(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Flush(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Close(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Close(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Sync(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Sync(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); + return s; +} + +IOStatus FSRandomRWFileTracingWrapper::Fsync(const IOOptions& options, + IODebugContext* dbg) { + StopWatchNano timer(clock_); + timer.Start(); + IOStatus s = target()->Fsync(options, dbg); + uint64_t elapsed = timer.ElapsedNanos(); + IOTraceRecord io_record(clock_->NowNanos(), TraceType::kIOTracer, + 0 /*io_op_data*/, __func__, elapsed, s.ToString(), + file_name_); + io_tracer_->WriteIOOp(io_record, dbg); return s; } } // namespace ROCKSDB_NAMESPACE diff --git a/env/file_system_tracer.h b/env/file_system_tracer.h index 
4f37cbb0f2b..da87797d30b 100644 --- a/env/file_system_tracer.h +++ b/env/file_system_tracer.h @@ -6,6 +6,7 @@ #pragma once #include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "trace_replay/io_tracer.h" namespace ROCKSDB_NAMESPACE { @@ -18,17 +19,44 @@ namespace ROCKSDB_NAMESPACE { // overridden. class FileSystemTracingWrapper : public FileSystemWrapper { public: - FileSystemTracingWrapper(std::shared_ptr t, - std::shared_ptr io_tracer) - : FileSystemWrapper(t), io_tracer_(io_tracer), env_(Env::Default()) {} + FileSystemTracingWrapper(const std::shared_ptr& t, + const std::shared_ptr& io_tracer) + : FileSystemWrapper(t), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()) {} ~FileSystemTracingWrapper() override {} + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewWritableFile(const std::string& fname, const FileOptions& file_opts, std::unique_ptr* result, IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, std::unique_ptr* result, IODebugContext* dbg) override; @@ -53,9 +81,12 @@ class FileSystemTracingWrapper : public FileSystemWrapper { IOStatus GetFileSize(const std::string& fname, const IOOptions& options, uint64_t* file_size, IODebugContext* dbg) override; + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override; + private: std::shared_ptr io_tracer_; - Env* env_; + SystemClock* clock_; }; // The FileSystemPtr is a wrapper class that takes pointer to storage systems @@ -103,10 +134,12 @@ class FileSystemPtr { class FSSequentialFileTracingWrapper : public FSSequentialFileWrapper { public: FSSequentialFileTracingWrapper(FSSequentialFile* t, - std::shared_ptr io_tracer) + std::shared_ptr io_tracer, + const std::string& file_name) : FSSequentialFileWrapper(t), io_tracer_(io_tracer), - env_(Env::Default()) {} + clock_(SystemClock::Default().get()), + file_name_(file_name) {} ~FSSequentialFileTracingWrapper() override {} @@ -121,7 +154,8 @@ class FSSequentialFileTracingWrapper : public FSSequentialFileWrapper { private: std::shared_ptr io_tracer_; - Env* env_; + SystemClock* clock_; + std::string file_name_; }; // The FSSequentialFilePtr is a wrapper class that takes pointer to storage @@ -133,10 +167,13 @@ class FSSequentialFilePtr { public: FSSequentialFilePtr() = delete; FSSequentialFilePtr(std::unique_ptr&& fs, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& file_name) : fs_(std::move(fs)), io_tracer_(io_tracer), - fs_tracer_(fs_.get(), io_tracer_) {} + fs_tracer_(fs_.get(), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} FSSequentialFile* operator->() const { if (io_tracer_ && io_tracer_->is_tracing_enabled()) { @@ -169,10 +206,12 
@@ class FSSequentialFilePtr { class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileWrapper { public: FSRandomAccessFileTracingWrapper(FSRandomAccessFile* t, - std::shared_ptr io_tracer) + std::shared_ptr io_tracer, + const std::string& file_name) : FSRandomAccessFileWrapper(t), io_tracer_(io_tracer), - env_(Env::Default()) {} + clock_(SystemClock::Default().get()), + file_name_(file_name) {} ~FSRandomAccessFileTracingWrapper() override {} @@ -190,7 +229,9 @@ class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileWrapper { private: std::shared_ptr io_tracer_; - Env* env_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; }; // The FSRandomAccessFilePtr is a wrapper class that takes pointer to storage @@ -201,10 +242,13 @@ class FSRandomAccessFileTracingWrapper : public FSRandomAccessFileWrapper { class FSRandomAccessFilePtr { public: FSRandomAccessFilePtr(std::unique_ptr&& fs, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& file_name) : fs_(std::move(fs)), io_tracer_(io_tracer), - fs_tracer_(fs_.get(), io_tracer_) {} + fs_tracer_(fs_.get(), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} FSRandomAccessFile* operator->() const { if (io_tracer_ && io_tracer_->is_tracing_enabled()) { @@ -237,8 +281,12 @@ class FSRandomAccessFilePtr { class FSWritableFileTracingWrapper : public FSWritableFileWrapper { public: FSWritableFileTracingWrapper(FSWritableFile* t, - std::shared_ptr io_tracer) - : FSWritableFileWrapper(t), io_tracer_(io_tracer), env_(Env::Default()) {} + std::shared_ptr io_tracer, + const std::string& file_name) + : FSWritableFileWrapper(t), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} ~FSWritableFileTracingWrapper() override {} @@ -271,7 +319,9 @@ class FSWritableFileTracingWrapper : public FSWritableFileWrapper { private: std::shared_ptr io_tracer_; - Env* env_; + SystemClock* clock_; + // Stores file name instead of full path. 
+ std::string file_name_; }; // The FSWritableFilePtr is a wrapper class that takes pointer to storage @@ -282,9 +332,13 @@ class FSWritableFileTracingWrapper : public FSWritableFileWrapper { class FSWritableFilePtr { public: FSWritableFilePtr(std::unique_ptr&& fs, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + const std::string& file_name) : fs_(std::move(fs)), io_tracer_(io_tracer) { - fs_tracer_.reset(new FSWritableFileTracingWrapper(fs_.get(), io_tracer_)); + fs_tracer_.reset(new FSWritableFileTracingWrapper( + fs_.get(), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */)); } FSWritableFile* operator->() const { @@ -324,8 +378,12 @@ class FSWritableFilePtr { class FSRandomRWFileTracingWrapper : public FSRandomRWFileWrapper { public: FSRandomRWFileTracingWrapper(FSRandomRWFile* t, - std::shared_ptr io_tracer) - : FSRandomRWFileWrapper(t), io_tracer_(io_tracer), env_(Env::Default()) {} + std::shared_ptr io_tracer, + const std::string& file_name) + : FSRandomRWFileWrapper(t), + io_tracer_(io_tracer), + clock_(SystemClock::Default().get()), + file_name_(file_name) {} ~FSRandomRWFileTracingWrapper() override {} @@ -336,9 +394,19 @@ class FSRandomRWFileTracingWrapper : public FSRandomRWFileWrapper { Slice* result, char* scratch, IODebugContext* dbg) const override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; + private: std::shared_ptr io_tracer_; - Env* env_; + SystemClock* clock_; + // Stores file name instead of full path. + std::string file_name_; }; // The FSRandomRWFilePtr is a wrapper class that takes pointer to storage @@ -349,10 +417,13 @@ class FSRandomRWFileTracingWrapper : public FSRandomRWFileWrapper { class FSRandomRWFilePtr { public: FSRandomRWFilePtr(std::unique_ptr&& fs, - std::shared_ptr io_tracer) + std::shared_ptr io_tracer, + const std::string& file_name) : fs_(std::move(fs)), io_tracer_(io_tracer), - fs_tracer_(fs_.get(), io_tracer_) {} + fs_tracer_(fs_.get(), io_tracer_, + file_name.substr(file_name.find_last_of("/\\") + + 1) /* pass file name */) {} FSRandomRWFile* operator->() const { if (io_tracer_ && io_tracer_->is_tracing_enabled()) { diff --git a/env/fs_posix.cc b/env/fs_posix.cc index 5fa3d8224f0..a3e360806b6 100644 --- a/env/fs_posix.cc +++ b/env/fs_posix.cc @@ -15,10 +15,6 @@ #endif #include #include - -#if defined(OS_LINUX) -#include -#endif #include #include #include @@ -29,13 +25,13 @@ #include #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include -#include #include #endif #include #include #include #include + #include // Get nano time includes #if defined(OS_LINUX) || defined(OS_FREEBSD) @@ -52,7 +48,6 @@ #include "env/composite_env_wrapper.h" #include "env/io_posix.h" -#include "logging/logging.h" #include "logging/posix_logger.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/thread_status_updater.h" @@ -86,9 +81,7 @@ inline mode_t GetDBFileMode(bool allow_non_owner_access) { return allow_non_owner_access ? 0644 : 0600; } -static uint64_t gettid() { - return Env::Default()->GetThreadID(); -} +static uint64_t gettid() { return Env::Default()->GetThreadID(); } // list of pathnames that are locked // Only used for error message. 
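The tracing wrappers above repeatedly strip the directory portion of a path with `substr(find_last_of("/\\") + 1)` before storing it in trace records. A tiny standalone illustration of that expression (not part of the patch):

```cpp
// Equivalent of the inline basename expression used by the tracing wrappers.
#include <cassert>
#include <string>

std::string BaseName(const std::string& path) {
  // find_last_of returns npos when no separator exists; npos + 1 wraps to 0,
  // so the whole string is returned unchanged in that case.
  return path.substr(path.find_last_of("/\\") + 1);
}

int main() {
  assert(BaseName("/db/data/000123.sst") == "000123.sst");
  assert(BaseName("C:\\db\\LOG") == "LOG");
  assert(BaseName("MANIFEST-000001") == "MANIFEST-000001");
  return 0;
}
```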
@@ -272,8 +265,7 @@ class PosixFileSystem : public FileSystem { } virtual IOStatus OpenWritableFile(const std::string& fname, - const FileOptions& options, - bool reopen, + const FileOptions& options, bool reopen, std::unique_ptr* result, IODebugContext* /*dbg*/) { result->reset(); @@ -556,26 +548,37 @@ class PosixFileSystem : public FileSystem { } IOStatus NewLogger(const std::string& fname, const IOOptions& /*opts*/, - std::shared_ptr* result, - IODebugContext* /*dbg*/) override { - FILE* f; + std::shared_ptr* result, + IODebugContext* /*dbg*/) override { + FILE* f = nullptr; + int fd; { IOSTATS_TIMER_GUARD(open_nanos); - f = fopen(fname.c_str(), - "w" + fd = open(fname.c_str(), + cloexec_flags(O_WRONLY | O_CREAT | O_TRUNC, nullptr), + GetDBFileMode(allow_non_owner_access_)); + if (fd != -1) { + f = fdopen(fd, + "w" #ifdef __GLIBC_PREREQ #if __GLIBC_PREREQ(2, 7) - "e" // glibc extension to enable O_CLOEXEC + "e" // glibc extension to enable O_CLOEXEC #endif #endif - ); + ); + } + } + if (fd == -1) { + result->reset(); + return status_to_io_status( + IOError("when open a file for new logger", fname, errno)); } if (f == nullptr) { + close(fd); result->reset(); return status_to_io_status( - IOError("when fopen a file for new logger", fname, errno)); + IOError("when fdopen a file for new logger", fname, errno)); } else { - int fd = fileno(f); #ifdef ROCKSDB_FALLOCATE_PRESENT fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * 1024); #endif @@ -612,6 +615,7 @@ class PosixFileSystem : public FileSystem { std::vector* result, IODebugContext* /*dbg*/) override { result->clear(); + DIR* d = opendir(dir.c_str()); if (d == nullptr) { switch (errno) { @@ -623,11 +627,36 @@ class PosixFileSystem : public FileSystem { return IOError("While opendir", dir, errno); } } + + // reset errno before calling readdir() + errno = 0; struct dirent* entry; while ((entry = readdir(d)) != nullptr) { - result->push_back(entry->d_name); + // filter out '.' and '..' directory entries + // which appear only on some platforms + const bool ignore = + entry->d_type == DT_DIR && + (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0); + if (!ignore) { + result->push_back(entry->d_name); + } + errno = 0; // reset errno if readdir() success + } + + // always attempt to close the dir + const auto pre_close_errno = errno; // errno may be modified by closedir + const int close_result = closedir(d); + + if (pre_close_errno != 0) { + // error occurred during readdir + return IOError("While readdir", dir, pre_close_errno); } - closedir(d); + + if (close_result != 0) { + // error occurred during closedir + return IOError("While closedir", dir, errno); + } + return IOStatus::OK(); } @@ -755,7 +784,9 @@ class PosixFileSystem : public FileSystem { LockHoldingInfo lhi; int64_t current_time = 0; // Ignore status code as the time is only used for error message. 
- Env::Default()->GetCurrentTime(¤t_time).PermitUncheckedError(); + SystemClock::Default() + ->GetCurrentTime(¤t_time) + .PermitUncheckedError(); lhi.acquire_time = current_time; lhi.acquiring_thread = Env::Default()->GetThreadID(); @@ -771,9 +802,9 @@ class PosixFileSystem : public FileSystem { // closed, all locks the process holds for that *file* are released const auto it_success = locked_files.insert({fname, lhi}); if (it_success.second == false) { + LockHoldingInfo prev_info = it_success.first->second; mutex_locked_files.Unlock(); errno = ENOLCK; - LockHoldingInfo& prev_info = it_success.first->second; // Note that the thread ID printed is the same one as the one in // posix logger, but posix logger prints it hex format. return IOError("lock hold by current process, acquire time " + @@ -841,7 +872,7 @@ class PosixFileSystem : public FileSystem { char the_path[256]; char* ret = getcwd(the_path, 256); if (ret == nullptr) { - return IOStatus::IOError(strerror(errno)); + return IOStatus::IOError(errnoStr(errno).c_str()); } *output_path = ret; @@ -875,7 +906,17 @@ class PosixFileSystem : public FileSystem { return IOError("While doing statvfs", fname, errno); } - *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + // sbuf.bfree is total free space available to root + // sbuf.bavail is total free space available to unprivileged user + // sbuf.bavail <= sbuf.bfree ... pick correct based upon effective user id + if (geteuid()) { + // non-zero user is unprivileged, or -1 if error. take more conservative + // size + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bavail); + } else { + // root user can access all disk space + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + } return IOStatus::OK(); } @@ -904,7 +945,7 @@ class PosixFileSystem : public FileSystem { } FileOptions OptimizeForLogWrite(const FileOptions& file_options, - const DBOptions& db_options) const override { + const DBOptions& db_options) const override { FileOptions optimized = file_options; optimized.use_mmap_writes = false; optimized.use_direct_writes = false; diff --git a/env/fs_readonly.h b/env/fs_readonly.h new file mode 100644 index 00000000000..89875106eec --- /dev/null +++ b/env/fs_readonly.h @@ -0,0 +1,104 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// A FileSystem wrapper that only allows read-only operation. +// +// This class has not been fully analyzed for providing strong security +// guarantees. 
+class ReadOnlyFileSystem : public FileSystemWrapper { + static inline IOStatus FailReadOnly() { + IOStatus s = IOStatus::IOError("Attempted write to ReadOnlyFileSystem"); + assert(s.GetRetryable() == false); + return s; + } + + public: + explicit ReadOnlyFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + + IOStatus NewWritableFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus ReuseWritableFile(const std::string& /*fname*/, + const std::string& /*old_fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewRandomRWFile(const std::string& /*fname*/, + const FileOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewDirectory(const std::string& /*dir*/, + const IOOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus DeleteFile(const std::string& /*fname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus CreateDir(const std::string& /*dirname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override { + // Allow if dir already exists + bool is_dir = false; + IOStatus s = IsDirectory(dirname, options, &is_dir, dbg); + if (s.ok() && is_dir) { + return s; + } else { + return FailReadOnly(); + } + } + IOStatus DeleteDir(const std::string& /*dirname*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus RenameFile(const std::string& /*src*/, const std::string& /*dest*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus LinkFile(const std::string& /*src*/, const std::string& /*dest*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus LockFile(const std::string& /*fname*/, const IOOptions& /*options*/, + FileLock** /*lock*/, IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } + IOStatus NewLogger(const std::string& /*fname*/, const IOOptions& /*options*/, + std::shared_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return FailReadOnly(); + } +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/env/fs_remap.cc b/env/fs_remap.cc new file mode 100644 index 00000000000..026f83cd12a --- /dev/null +++ b/env/fs_remap.cc @@ -0,0 +1,306 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
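As an illustration of how the new ReadOnlyFileSystem is meant to be composed, here is a hypothetical sketch (not part of the patch). It uses the internal headers env/fs_readonly.h and env/composite_env_wrapper.h and assumes NewCompositeEnv is declared there, as used by the tests earlier in this diff.

```cpp
// Hypothetical composition: wrap the default FileSystem in the read-only
// wrapper and open a DB through an Env built from it.
#include <memory>
#include <string>

#include "env/composite_env_wrapper.h"  // NewCompositeEnv (internal)
#include "env/fs_readonly.h"            // ReadOnlyFileSystem (internal)
#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

Status OpenForReadOnly(const std::string& dbname, DB** db,
                       std::unique_ptr<Env>* env_out) {
  auto ro_fs = std::make_shared<ReadOnlyFileSystem>(FileSystem::Default());
  std::unique_ptr<Env> env = NewCompositeEnv(ro_fs);

  Options options;
  options.env = env.get();
  // Any attempted write reaches ReadOnlyFileSystem and fails with
  // "Attempted write to ReadOnlyFileSystem".
  Status s = DB::OpenForReadOnly(options, dbname, db);
  *env_out = std::move(env);  // keep the Env alive as long as the DB is open
  return s;
}
```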
+ +#ifndef ROCKSDB_LITE + +#include "env/fs_remap.h" + +namespace ROCKSDB_NAMESPACE { + +RemapFileSystem::RemapFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + +std::pair RemapFileSystem::EncodePathWithNewBasename( + const std::string& path) { + // No difference by default + return EncodePath(path); +} + +Status RemapFileSystem::RegisterDbPaths(const std::vector& paths) { + std::vector encoded_paths; + encoded_paths.reserve(paths.size()); + for (auto& path : paths) { + auto status_and_enc_path = EncodePathWithNewBasename(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + encoded_paths.emplace_back(status_and_enc_path.second); + } + return FileSystemWrapper::RegisterDbPaths(encoded_paths); +} + +Status RemapFileSystem::UnregisterDbPaths( + const std::vector& paths) { + std::vector encoded_paths; + encoded_paths.reserve(paths.size()); + for (auto& path : paths) { + auto status_and_enc_path = EncodePathWithNewBasename(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + encoded_paths.emplace_back(status_and_enc_path.second); + } + return FileSystemWrapper::UnregisterDbPaths(encoded_paths); +} + +IOStatus RemapFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewSequentialFile(status_and_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewRandomAccessFile(status_and_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewWritableFile(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + auto status_and_old_enc_path = EncodePath(old_fname); + if (!status_and_old_enc_path.first.ok()) { + return status_and_old_enc_path.first; + } + return FileSystemWrapper::ReuseWritableFile(status_and_old_enc_path.second, + status_and_old_enc_path.second, + options, result, dbg); +} + +IOStatus RemapFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewRandomRWFile(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::NewDirectory(const std::string& dir, + const IOOptions& options, + std::unique_ptr* result, + 
IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewDirectory(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::FileExists(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::FileExists(status_and_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::GetChildren(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetChildren(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::GetChildrenFileAttributes( + const std::string& dir, const IOOptions& options, + std::vector* result, IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dir); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetChildrenFileAttributes( + status_and_enc_path.second, options, result, dbg); +} + +IOStatus RemapFileSystem::DeleteFile(const std::string& fname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::DeleteFile(status_and_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::CreateDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::CreateDir(status_and_enc_path.second, options, dbg); +} + +IOStatus RemapFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::CreateDirIfMissing(status_and_enc_path.second, + options, dbg); +} + +IOStatus RemapFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(dirname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::DeleteDir(status_and_enc_path.second, options, dbg); +} + +IOStatus RemapFileSystem::GetFileSize(const std::string& fname, + const IOOptions& options, + uint64_t* file_size, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetFileSize(status_and_enc_path.second, options, + file_size, dbg); +} + +IOStatus RemapFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetFileModificationTime(status_and_enc_path.second, + options, file_mtime, dbg); +} + +IOStatus 
RemapFileSystem::IsDirectory(const std::string& path, + const IOOptions& options, bool* is_dir, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::IsDirectory(status_and_enc_path.second, options, + is_dir, dbg); +} + +IOStatus RemapFileSystem::RenameFile(const std::string& src, + const std::string& dest, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_src_enc_path = EncodePath(src); + if (!status_and_src_enc_path.first.ok()) { + return status_and_src_enc_path.first; + } + auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); + if (!status_and_dest_enc_path.first.ok()) { + return status_and_dest_enc_path.first; + } + return FileSystemWrapper::RenameFile(status_and_src_enc_path.second, + status_and_dest_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::LinkFile(const std::string& src, + const std::string& dest, + const IOOptions& options, + IODebugContext* dbg) { + auto status_and_src_enc_path = EncodePath(src); + if (!status_and_src_enc_path.first.ok()) { + return status_and_src_enc_path.first; + } + auto status_and_dest_enc_path = EncodePathWithNewBasename(dest); + if (!status_and_dest_enc_path.first.ok()) { + return status_and_dest_enc_path.first; + } + return FileSystemWrapper::LinkFile(status_and_src_enc_path.second, + status_and_dest_enc_path.second, options, + dbg); +} + +IOStatus RemapFileSystem::LockFile(const std::string& fname, + const IOOptions& options, FileLock** lock, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + // FileLock subclasses may store path (e.g., PosixFileLock stores it). We + // can skip stripping the chroot directory from this path because callers + // shouldn't use it. + return FileSystemWrapper::LockFile(status_and_enc_path.second, options, lock, + dbg); +} + +IOStatus RemapFileSystem::NewLogger(const std::string& fname, + const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePathWithNewBasename(fname); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::NewLogger(status_and_enc_path.second, options, + result, dbg); +} + +IOStatus RemapFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) { + auto status_and_enc_path = EncodePath(db_path); + if (!status_and_enc_path.first.ok()) { + return status_and_enc_path.first; + } + return FileSystemWrapper::GetAbsolutePath(status_and_enc_path.second, options, + output_path, dbg); +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/env/fs_remap.h b/env/fs_remap.h new file mode 100644 index 00000000000..4975822f66d --- /dev/null +++ b/env/fs_remap.h @@ -0,0 +1,131 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include + +#include "rocksdb/file_system.h" + +namespace ROCKSDB_NAMESPACE { + +// An abstract FileSystem wrapper that creates a view of an existing +// FileSystem by remapping names in some way. 
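[Editor's note: the remapping hooks implemented above, and declared in the header that continues below, only require a subclass to provide EncodePath(); every other override routes through that mapping. A hedged sketch of a hypothetical subclass that pins all paths under a fixed prefix, roughly a chroot-style view; the class name and semantics are invented for illustration.]

#include <memory>
#include <string>
#include <utility>

#include "env/fs_remap.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical example: expose "<prefix>/<path>" of the wrapped filesystem
// as plain "<path>".
class PrefixRemapFileSystem : public RemapFileSystem {
 public:
  PrefixRemapFileSystem(const std::shared_ptr<FileSystem>& base,
                        std::string prefix)
      : RemapFileSystem(base), prefix_(std::move(prefix)) {}

  const char* Name() const override { return "PrefixRemapFileSystem"; }

 protected:
  std::pair<IOStatus, std::string> EncodePath(
      const std::string& path) override {
    if (path.empty() || path[0] != '/') {
      return {IOStatus::InvalidArgument(path, "Not an absolute path"), ""};
    }
    // Reads and writes on "path" land on "prefix_ + path" in the wrapped
    // filesystem; RemapFileSystem's overrides reuse this mapping everywhere.
    return {IOStatus::OK(), prefix_ + path};
  }

 private:
  std::string prefix_;
};

}  // namespace ROCKSDB_NAMESPACE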
+// +// This class has not been fully analyzed for providing strong security +// guarantees. +class RemapFileSystem : public FileSystemWrapper { + public: + explicit RemapFileSystem(const std::shared_ptr& base); + + protected: + // Returns status and mapped-to path in the wrapped filesystem. + // If it returns non-OK status, the returned path should not be used. + virtual std::pair EncodePath( + const std::string& path) = 0; + + // Similar to EncodePath() except used in cases in which it is OK for + // no file or directory on 'path' to already exist, such as if the + // operation would create one. However, the parent of 'path' is expected + // to exist for the operation to succeed. + // Default implementation: call EncodePath + virtual std::pair EncodePathWithNewBasename( + const std::string& path); + + public: + // Left abstract: + // const char* Name() const override { ... } + + Status RegisterDbPaths(const std::vector& paths) override; + + Status UnregisterDbPaths(const std::vector& paths) override; + + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewDirectory(const std::string& dir, const IOOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus FileExists(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus GetChildrenFileAttributes(const std::string& dir, + const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + + IOStatus IsDirectory(const std::string& path, const IOOptions& options, + bool* is_dir, IODebugContext* dbg) override; + + IOStatus RenameFile(const std::string& src, const std::string& dest, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LinkFile(const std::string& src, const std::string& dest, + const IOOptions& options, IODebugContext* dbg) override; + + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + + IOStatus 
NewLogger(const std::string& fname, const IOOptions& options, + std::shared_ptr* result, + IODebugContext* dbg) override; + + IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE diff --git a/env/io_posix.cc b/env/io_posix.cc index 689d898120b..a041b32aa6a 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -27,7 +27,6 @@ #include #ifdef OS_LINUX #include -#include #include #endif #include "monitoring/iostats_context_imp.h" @@ -59,7 +58,7 @@ IOStatus IOError(const std::string& context, const std::string& file_name, switch (err_number) { case ENOSPC: { IOStatus s = IOStatus::NoSpace(IOErrorMsg(context, file_name), - strerror(err_number)); + errnoStr(err_number).c_str()); s.SetRetryable(true); return s; } @@ -67,10 +66,10 @@ IOStatus IOError(const std::string& context, const std::string& file_name, return IOStatus::IOError(IOStatus::kStaleFile); case ENOENT: return IOStatus::PathNotFound(IOErrorMsg(context, file_name), - strerror(err_number)); + errnoStr(err_number).c_str()); default: return IOStatus::IOError(IOErrorMsg(context, file_name), - strerror(err_number)); + errnoStr(err_number).c_str()); } } @@ -634,6 +633,8 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); } + IOStatus ios = IOStatus::OK(); + struct WrappedReadRequest { FSReadRequest* req; struct iovec iov; @@ -680,19 +681,47 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, ssize_t ret = io_uring_submit_and_wait(iu, static_cast(this_reqs)); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return1", + &ret); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_submit_and_wait:return2", + iu); + if (static_cast(ret) != this_reqs) { fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs); + // If error happens and we submitted fewer than expected, it is an + // exception case and we don't retry here. We should still consume + // what is is submitted in the ring. 
+ for (ssize_t i = 0; i < ret; i++) { + struct io_uring_cqe* cqe = nullptr; + io_uring_wait_cqe(iu, &cqe); + if (cqe != nullptr) { + io_uring_cqe_seen(iu, cqe); + } + } + return IOStatus::IOError("io_uring_submit_and_wait() requested " + + ToString(this_reqs) + " but returned " + + ToString(ret)); } - assert(static_cast(ret) == this_reqs); for (size_t i = 0; i < this_reqs; i++) { - struct io_uring_cqe* cqe; + struct io_uring_cqe* cqe = nullptr; WrappedReadRequest* req_wrap; // We could use the peek variant here, but this seems safer in terms // of our initial wait not reaping all completions ret = io_uring_wait_cqe(iu, &cqe); - assert(!ret); + TEST_SYNC_POINT_CALLBACK( + "PosixRandomAccessFile::MultiRead:io_uring_wait_cqe:return", &ret); + if (ret) { + ios = IOStatus::IOError("io_uring_wait_cqe() returns " + ToString(ret)); + + if (cqe != nullptr) { + io_uring_cqe_seen(iu, cqe); + } + continue; + } req_wrap = static_cast(io_uring_cqe_get_data(cqe)); FSReadRequest* req = req_wrap->req; @@ -741,7 +770,7 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, io_uring_cqe_seen(iu, cqe); } } - return IOStatus::OK(); + return ios; #else return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); #endif @@ -894,7 +923,7 @@ IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) { * knows enough to skip zero suffixes. */ IOStatus PosixMmapFile::UnmapCurrentRegion() { - TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); if (base_ != nullptr) { int munmap_status = munmap(base_, limit_ - base_); if (munmap_status != 0) { @@ -917,7 +946,7 @@ IOStatus PosixMmapFile::UnmapCurrentRegion() { IOStatus PosixMmapFile::MapNewRegion() { #ifdef ROCKSDB_FALLOCATE_PRESENT assert(base_ == nullptr); - TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0"); // we can't fallocate with FALLOC_FL_KEEP_SIZE here if (allow_fallocate_) { IOSTATS_TIMER_GUARD(allocate_nanos); @@ -928,17 +957,17 @@ IOStatus PosixMmapFile::MapNewRegion() { } if (alloc_status != 0) { return IOStatus::IOError("Error allocating space to file : " + filename_ + - "Error : " + strerror(alloc_status)); + "Error : " + errnoStr(alloc_status).c_str()); } } - TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Append:1"); void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, file_offset_); if (ptr == MAP_FAILED) { return IOStatus::IOError("MMap failed on " + filename_); } - TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Append:2"); base_ = reinterpret_cast(ptr); limit_ = base_ + map_size_; @@ -959,7 +988,7 @@ IOStatus PosixMmapFile::Msync() { size_t p1 = TruncateToPageBoundary(last_sync_ - base_); size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); last_sync_ = dst_; - TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Msync:0"); if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { return IOError("While msync", filename_, errno); } @@ -1012,7 +1041,7 @@ IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/, if (!s.ok()) { return s; } - TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Append:0"); } size_t n = (left <= avail) ? 
left : avail; @@ -1110,7 +1139,7 @@ IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len, IODebugContext* /*dbg*/) { assert(offset <= static_cast(std::numeric_limits::max())); assert(len <= static_cast(std::numeric_limits::max())); - TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixMmapFile::Allocate:0"); int alloc_status = 0; if (allow_fallocate_) { alloc_status = @@ -1214,6 +1243,7 @@ IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/, size_t block_size; size_t last_allocated_block; GetPreallocationStatus(&block_size, &last_allocated_block); + TEST_SYNC_POINT_CALLBACK("PosixWritableFile::Close", &last_allocated_block); if (last_allocated_block > 0) { // trim the extra space preallocated at the end of the file // NOTE(ljin): we probably don't want to surface failure as an IOError, @@ -1333,7 +1363,7 @@ IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len, IODebugContext* /*dbg*/) { assert(offset <= static_cast(std::numeric_limits::max())); assert(len <= static_cast(std::numeric_limits::max())); - TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("PosixWritableFile::Allocate:0"); IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = 0; if (allow_fallocate_) { diff --git a/env/mock_env.cc b/env/mock_env.cc index 3fdeac2b9ed..3733371fce1 100644 --- a/env/mock_env.cc +++ b/env/mock_env.cc @@ -8,12 +8,16 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "env/mock_env.h" + #include #include + #include "file/filename.h" #include "port/sys_time.h" +#include "rocksdb/file_system.h" +#include "test_util/sync_point.h" #include "util/cast_util.h" -#include "util/murmurhash.h" +#include "util/hash.h" #include "util/random.h" #include "util/rate_limiter.h" @@ -29,8 +33,7 @@ class MemFile { locked_(false), size_(0), modified_time_(Now()), - rnd_(static_cast( - MurmurHash(fn.data(), static_cast(fn.size()), 0))), + rnd_(Lower32of64(GetSliceNPHash64(fn))), fsynced_bytes_(0) {} // No copying allowed. 
MemFile(const MemFile&) = delete; @@ -78,7 +81,8 @@ class MemFile { uint64_t Size() const { return size_; } - void Truncate(size_t size) { + void Truncate(size_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { MutexLock lock(&mutex_); if (size < size_) { data_.resize(size); @@ -100,7 +104,17 @@ class MemFile { } } - Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*options*/, + Slice* result, char* scratch, IODebugContext* /*dbg*/) const { + { + IOStatus s; + TEST_SYNC_POINT_CALLBACK("MemFile::Read:IOStatus", &s); + if (!s.ok()) { + // with sync point only + *result = Slice(); + return s; + } + } MutexLock lock(&mutex_); const uint64_t available = Size() - std::min(Size(), offset); size_t offset_ = static_cast(offset); @@ -109,7 +123,7 @@ class MemFile { } if (n == 0) { *result = Slice(); - return Status::OK(); + return IOStatus::OK(); } if (scratch) { memcpy(scratch, &(data_[offset_]), n); @@ -117,10 +131,11 @@ class MemFile { } else { *result = Slice(&(data_[offset_]), n); } - return Status::OK(); + return IOStatus::OK(); } - Status Write(uint64_t offset, const Slice& data) { + IOStatus Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, IODebugContext* /*dbg*/) { MutexLock lock(&mutex_); size_t offset_ = static_cast(offset); if (offset + data.size() > data_.size()) { @@ -129,20 +144,21 @@ class MemFile { data_.replace(offset_, data.size(), data.data(), data.size()); size_ = data_.size(); modified_time_ = Now(); - return Status::OK(); + return IOStatus::OK(); } - Status Append(const Slice& data) { + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { MutexLock lock(&mutex_); data_.append(data.data(), data.size()); size_ = data_.size(); modified_time_ = Now(); - return Status::OK(); + return IOStatus::OK(); } - Status Fsync() { + IOStatus Fsync(const IOOptions& /*options*/, IODebugContext* /*dbg*/) { fsynced_bytes_ = size_.load(); - return Status::OK(); + return IOStatus::OK(); } uint64_t ModifiedTime() const { return modified_time_; } @@ -177,111 +193,176 @@ class MemFile { namespace { -class MockSequentialFile : public SequentialFile { +class MockSequentialFile : public FSSequentialFile { public: - explicit MockSequentialFile(MemFile* file) : file_(file), pos_(0) { + explicit MockSequentialFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_reads), + use_mmap_read_(opts.use_mmap_reads), + pos_(0) { file_->Ref(); } ~MockSequentialFile() override { file_->Unref(); } - Status Read(size_t n, Slice* result, char* scratch) override { - Status s = file_->Read(pos_, n, result, scratch); + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override { + IOStatus s = file_->Read(pos_, n, options, result, + (use_mmap_read_) ? 
nullptr : scratch, dbg); if (s.ok()) { pos_ += result->size(); } return s; } - Status Skip(uint64_t n) override { + bool use_direct_io() const override { return use_direct_io_; } + IOStatus Skip(uint64_t n) override { if (pos_ > file_->Size()) { - return Status::IOError("pos_ > file_->Size()"); + return IOStatus::IOError("pos_ > file_->Size()"); } const uint64_t available = file_->Size() - pos_; if (n > available) { n = available; } pos_ += static_cast(n); - return Status::OK(); + return IOStatus::OK(); } private: MemFile* file_; + bool use_direct_io_; + bool use_mmap_read_; size_t pos_; }; -class MockRandomAccessFile : public RandomAccessFile { +class MockRandomAccessFile : public FSRandomAccessFile { public: - explicit MockRandomAccessFile(MemFile* file) : file_(file) { file_->Ref(); } + explicit MockRandomAccessFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_reads), + use_mmap_read_(opts.use_mmap_reads) { + file_->Ref(); + } ~MockRandomAccessFile() override { file_->Unref(); } - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - return file_->Read(offset, n, result, scratch); + bool use_direct_io() const override { return use_direct_io_; } + + IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + if (use_mmap_read_) { + return file_->Read(offset, n, options, result, nullptr, dbg); + } else { + return file_->Read(offset, n, options, result, scratch, dbg); + } } private: MemFile* file_; + bool use_direct_io_; + bool use_mmap_read_; }; -class MockRandomRWFile : public RandomRWFile { +class MockRandomRWFile : public FSRandomRWFile { public: explicit MockRandomRWFile(MemFile* file) : file_(file) { file_->Ref(); } ~MockRandomRWFile() override { file_->Unref(); } - Status Write(uint64_t offset, const Slice& data) override { - return file_->Write(offset, data); + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { + return file_->Write(offset, data, options, dbg); } - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - return file_->Read(offset, n, result, scratch); + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { + return file_->Read(offset, n, options, result, scratch, dbg); } - Status Close() override { return file_->Fsync(); } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } - Status Flush() override { return Status::OK(); } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Sync() override { return file_->Fsync(); } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } private: MemFile* file_; }; -class MockWritableFile : public WritableFile { +class MockWritableFile : public FSWritableFile { public: - MockWritableFile(MemFile* file, RateLimiter* rate_limiter) - : file_(file), rate_limiter_(rate_limiter) { + MockWritableFile(MemFile* file, const FileOptions& opts) + : file_(file), + use_direct_io_(opts.use_direct_writes), + rate_limiter_(opts.rate_limiter) { file_->Ref(); } 
~MockWritableFile() override { file_->Unref(); } - Status Append(const Slice& data) override { + bool use_direct_io() const override { return false && use_direct_io_; } + + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override { size_t bytes_written = 0; while (bytes_written < data.size()) { auto bytes = RequestToken(data.size() - bytes_written); - Status s = file_->Append(Slice(data.data() + bytes_written, bytes)); + IOStatus s = file_->Append(Slice(data.data() + bytes_written, bytes), + options, dbg); if (!s.ok()) { return s; } bytes_written += bytes; } - return Status::OK(); + return IOStatus::OK(); } - Status Truncate(uint64_t size) override { - file_->Truncate(static_cast(size)); - return Status::OK(); + + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& data, uint64_t /*offset*/, + const IOOptions& options, + IODebugContext* dbg) override { + assert(use_direct_io_); + return Append(data, options, dbg); } - Status Close() override { return file_->Fsync(); } - Status Flush() override { return Status::OK(); } + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override { + file_->Truncate(static_cast(size), options, dbg); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } - Status Sync() override { return file_->Fsync(); } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - uint64_t GetFileSize() override { return file_->Size(); } + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override { + return file_->Fsync(options, dbg); + } + + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return file_->Size(); + } private: inline size_t RequestToken(size_t bytes) { @@ -294,12 +375,16 @@ class MockWritableFile : public WritableFile { } MemFile* file_; + bool use_direct_io_; RateLimiter* rate_limiter_; }; -class MockEnvDirectory : public Directory { +class MockEnvDirectory : public FSDirectory { public: - Status Fsync() override { return Status::OK(); } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } }; class MockEnvFileLock : public FileLock { @@ -314,21 +399,26 @@ class MockEnvFileLock : public FileLock { class TestMemLogger : public Logger { private: - std::unique_ptr file_; + std::unique_ptr file_; std::atomic_size_t log_size_; static const uint64_t flush_every_seconds_ = 5; std::atomic_uint_fast64_t last_flush_micros_; Env* env_; + IOOptions options_; + IODebugContext* dbg_; std::atomic flush_pending_; public: - TestMemLogger(std::unique_ptr f, Env* env, + TestMemLogger(std::unique_ptr f, Env* env, + const IOOptions& options, IODebugContext* dbg, const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) : Logger(log_level), file_(std::move(f)), log_size_(0), last_flush_micros_(0), env_(env), + options_(options), + dbg_(dbg), flush_pending_(false) {} ~TestMemLogger() override {} @@ -394,7 +484,7 @@ class TestMemLogger : public Logger { assert(p <= limit); const size_t write_size = p - base; - Status s = file_->Append(Slice(base, write_size)); + Status s = file_->Append(Slice(base, write_size), options_, dbg_); if (s.ok()) { flush_pending_ = true; log_size_ += write_size; @@ -414,151 +504,305 @@ class TestMemLogger : public Logger { size_t GetLogFileSize() const override { return 
log_size_; } }; -} // Anonymous namespace +class MockFileSystem : public FileSystem { + public: + explicit MockFileSystem(Env* env, bool supports_direct_io = true) + : env_(env), supports_direct_io_(supports_direct_io) {} -MockEnv::MockEnv(Env* base_env) : EnvWrapper(base_env), fake_sleep_micros_(0) {} + ~MockFileSystem() override { + for (auto i = file_map_.begin(); i != file_map_.end(); ++i) { + i->second->Unref(); + } + } -MockEnv::~MockEnv() { - for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i) { - i->second->Unref(); + const char* Name() const override { return "Memory"; } + IOStatus NewSequentialFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewWritableFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewDirectory(const std::string& /*name*/, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus FileExists(const std::string& fname, const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) override; + IOStatus GetChildren(const std::string& dir, const IOOptions& options, + std::vector* result, + IODebugContext* dbg) override; + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Truncate(const std::string& fname, size_t size, + const IOOptions& options, IODebugContext* dbg) override; + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& options, IODebugContext* dbg) override; + IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + IOStatus UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) override; + // Get full directory name for this db. 
+ IOStatus GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* /*dbg*/) override { + *output_path = NormalizeMockPath(db_path); + if (output_path->at(0) != '/') { + return IOStatus::NotSupported("GetAbsolutePath"); + } else { + return IOStatus::OK(); + } } -} + IOStatus IsDirectory(const std::string& /*path*/, + const IOOptions& /*options*/, bool* /*is_dir*/, + IODebugContext* /*dgb*/) override { + return IOStatus::NotSupported("IsDirectory"); + } + + Status CorruptBuffer(const std::string& fname); + private: + bool RenameFileInternal(const std::string& src, const std::string& dest); + void DeleteFileInternal(const std::string& fname); + bool GetChildrenInternal(const std::string& fname, + std::vector* results); + + std::string NormalizeMockPath(const std::string& path) { + std::string p = NormalizePath(path); + if (p.back() == kFilePathSeparator && p.size() > 1) { + p.pop_back(); + } + return p; + } + + private: + // Map from filenames to MemFile objects, representing a simple file system. + port::Mutex mutex_; + std::map file_map_; // Protected by mutex_. + Env* env_; + bool supports_direct_io_; +}; + +} // Anonymous namespace // Partial implementation of the Env interface. -Status MockEnv::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& /*soptions*/) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { *result = nullptr; - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } auto* f = file_map_[fn]; if (f->is_lock_file()) { - return Status::InvalidArgument(fn, "Cannot open a lock file."); + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); + } else if (file_opts.use_direct_reads && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockSequentialFile(f, file_opts)); + return IOStatus::OK(); } - result->reset(new MockSequentialFile(f)); - return Status::OK(); } -Status MockEnv::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& /*soptions*/) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { *result = nullptr; - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } auto* f = file_map_[fn]; if (f->is_lock_file()) { - return Status::InvalidArgument(fn, "Cannot open a lock file."); + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); + } else if (file_opts.use_direct_reads && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockRandomAccessFile(f, file_opts)); + return IOStatus::OK(); } - result->reset(new MockRandomAccessFile(f)); - return Status::OK(); } -Status MockEnv::NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& /*soptions*/) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewRandomRWFile( + const std::string& fname, const FileOptions& /*file_opts*/, + 
std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { *result = nullptr; - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } auto* f = file_map_[fn]; if (f->is_lock_file()) { - return Status::InvalidArgument(fn, "Cannot open a lock file."); + return IOStatus::InvalidArgument(fn, "Cannot open a lock file."); } result->reset(new MockRandomRWFile(f)); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) { - auto s = RenameFile(old_fname, fname); +IOStatus MockFileSystem::ReuseWritableFile( + const std::string& fname, const std::string& old_fname, + const FileOptions& options, std::unique_ptr* result, + IODebugContext* dbg) { + auto s = RenameFile(old_fname, fname, IOOptions(), dbg); if (!s.ok()) { return s; + } else { + result->reset(); + return NewWritableFile(fname, options, result, dbg); } - result->reset(); - return NewWritableFile(fname, result, options); } -Status MockEnv::NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& env_options) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { DeleteFileInternal(fn); } - MemFile* file = new MemFile(this, fn, false); + MemFile* file = new MemFile(env_, fn, false); file->Ref(); file_map_[fn] = file; + if (file_opts.use_direct_writes && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockWritableFile(file, file_opts)); + return IOStatus::OK(); + } +} - result->reset(new MockWritableFile(file, env_options.rate_limiter)); - return Status::OK(); +IOStatus MockFileSystem::ReopenWritableFile( + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); + MutexLock lock(&mutex_); + MemFile* file = nullptr; + if (file_map_.find(fn) == file_map_.end()) { + file = new MemFile(env_, fn, false); + // Only take a reference when we create the file objectt + file->Ref(); + file_map_[fn] = file; + } else { + file = file_map_[fn]; + } + if (file_opts.use_direct_writes && !supports_direct_io_) { + return IOStatus::NotSupported("Direct I/O Not Supported"); + } else { + result->reset(new MockWritableFile(file, file_opts)); + return IOStatus::OK(); + } } -Status MockEnv::NewDirectory(const std::string& /*name*/, - std::unique_ptr* result) { +IOStatus MockFileSystem::NewDirectory(const std::string& /*name*/, + const IOOptions& /*io_opts*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { result->reset(new MockEnvDirectory()); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::FileExists(const std::string& fname) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::FileExists(const std::string& fname, + const IOOptions& /*io_opts*/, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { // File exists - return Status::OK(); + return IOStatus::OK(); } // Now also check if fn exists as a dir for (const auto& iter : file_map_) { const 
std::string& filename = iter.first; if (filename.size() >= fn.size() + 1 && filename[fn.size()] == '/' && Slice(filename).starts_with(Slice(fn))) { - return Status::OK(); + return IOStatus::OK(); } } - return Status::NotFound(); + return IOStatus::NotFound(); } -Status MockEnv::GetChildren(const std::string& dir, - std::vector* result) { - auto d = NormalizePath(dir); +bool MockFileSystem::GetChildrenInternal(const std::string& dir, + std::vector* result) { + auto d = NormalizeMockPath(dir); bool found_dir = false; - { - MutexLock lock(&mutex_); - result->clear(); - for (const auto& iter : file_map_) { - const std::string& filename = iter.first; - - if (filename == d) { - found_dir = true; - } else if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' && - Slice(filename).starts_with(Slice(d))) { - found_dir = true; - size_t next_slash = filename.find('/', d.size() + 1); - if (next_slash != std::string::npos) { - result->push_back( - filename.substr(d.size() + 1, next_slash - d.size() - 1)); - } else { - result->push_back(filename.substr(d.size() + 1)); - } + result->clear(); + for (const auto& iter : file_map_) { + const std::string& filename = iter.first; + + if (filename == d) { + found_dir = true; + } else if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' && + Slice(filename).starts_with(Slice(d))) { + found_dir = true; + size_t next_slash = filename.find('/', d.size() + 1); + if (next_slash != std::string::npos) { + result->push_back( + filename.substr(d.size() + 1, next_slash - d.size() - 1)); + } else { + result->push_back(filename.substr(d.size() + 1)); } } } result->erase(std::unique(result->begin(), result->end()), result->end()); - return found_dir ? Status::OK() : Status::NotFound(); + return found_dir; +} + +IOStatus MockFileSystem::GetChildren(const std::string& dir, + const IOOptions& /*options*/, + std::vector* result, + IODebugContext* /*dbg*/) { + MutexLock lock(&mutex_); + bool found_dir = GetChildrenInternal(dir, result); + return found_dir ? 
IOStatus::OK() : IOStatus::NotFound(dir); } -void MockEnv::DeleteFileInternal(const std::string& fname) { - assert(fname == NormalizePath(fname)); +void MockFileSystem::DeleteFileInternal(const std::string& fname) { + assert(fname == NormalizeMockPath(fname)); const auto& pair = file_map_.find(fname); if (pair != file_map_.end()) { pair->second->Unref(); @@ -566,180 +810,222 @@ void MockEnv::DeleteFileInternal(const std::string& fname) { } } -Status MockEnv::DeleteFile(const std::string& fname) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::DeleteFile(const std::string& fname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } DeleteFileInternal(fn); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::Truncate(const std::string& fname, size_t size) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::Truncate(const std::string& fname, size_t size, + const IOOptions& options, + IODebugContext* dbg) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } - iter->second->Truncate(size); - return Status::OK(); + iter->second->Truncate(size, options, dbg); + return IOStatus::OK(); } -Status MockEnv::CreateDir(const std::string& dirname) { - auto dn = NormalizePath(dirname); +IOStatus MockFileSystem::CreateDir(const std::string& dirname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto dn = NormalizeMockPath(dirname); MutexLock lock(&mutex_); if (file_map_.find(dn) == file_map_.end()) { - MemFile* file = new MemFile(this, dn, false); + MemFile* file = new MemFile(env_, dn, false); file->Ref(); file_map_[dn] = file; } else { - return Status::IOError(); + return IOStatus::IOError(); } - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::CreateDirIfMissing(const std::string& dirname) { - CreateDir(dirname).PermitUncheckedError(); - return Status::OK(); +IOStatus MockFileSystem::CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) { + CreateDir(dirname, options, dbg).PermitUncheckedError(); + return IOStatus::OK(); } -Status MockEnv::DeleteDir(const std::string& dirname) { - return DeleteFile(dirname); +IOStatus MockFileSystem::DeleteDir(const std::string& dirname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto dir = NormalizeMockPath(dirname); + MutexLock lock(&mutex_); + if (file_map_.find(dir) == file_map_.end()) { + return IOStatus::PathNotFound(dir); + } else { + std::vector children; + if (GetChildrenInternal(dir, &children)) { + for (const auto& child : children) { + DeleteFileInternal(child); + } + } + DeleteFileInternal(dir); + return IOStatus::OK(); + } } -Status MockEnv::GetFileSize(const std::string& fname, uint64_t* file_size) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::GetFileSize(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* file_size, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } *file_size = iter->second->Size(); - return Status::OK(); + return 
IOStatus::OK(); } -Status MockEnv::GetFileModificationTime(const std::string& fname, - uint64_t* time) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& /*options*/, + uint64_t* time, + IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); + return IOStatus::PathNotFound(fn); } *time = iter->second->ModifiedTime(); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::RenameFile(const std::string& src, const std::string& dest) { - auto s = NormalizePath(src); - auto t = NormalizePath(dest); - MutexLock lock(&mutex_); - if (file_map_.find(s) == file_map_.end()) { - return Status::IOError(s, "File not found"); +bool MockFileSystem::RenameFileInternal(const std::string& src, + const std::string& dest) { + if (file_map_.find(src) == file_map_.end()) { + return false; + } else { + std::vector children; + if (GetChildrenInternal(src, &children)) { + for (const auto& child : children) { + RenameFileInternal(src + "/" + child, dest + "/" + child); + } + } + DeleteFileInternal(dest); + file_map_[dest] = file_map_[src]; + file_map_.erase(src); + return true; } +} - DeleteFileInternal(t); - file_map_[t] = file_map_[s]; - file_map_.erase(s); - return Status::OK(); +IOStatus MockFileSystem::RenameFile(const std::string& src, + const std::string& dest, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto s = NormalizeMockPath(src); + auto t = NormalizeMockPath(dest); + MutexLock lock(&mutex_); + bool found = RenameFileInternal(s, t); + if (!found) { + return IOStatus::PathNotFound(s); + } else { + return IOStatus::OK(); + } } -Status MockEnv::LinkFile(const std::string& src, const std::string& dest) { - auto s = NormalizePath(src); - auto t = NormalizePath(dest); +IOStatus MockFileSystem::LinkFile(const std::string& src, + const std::string& dest, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + auto s = NormalizeMockPath(src); + auto t = NormalizeMockPath(dest); MutexLock lock(&mutex_); if (file_map_.find(s) == file_map_.end()) { - return Status::IOError(s, "File not found"); + return IOStatus::PathNotFound(s); } DeleteFileInternal(t); file_map_[t] = file_map_[s]; file_map_[t]->Ref(); // Otherwise it might get deleted when noone uses s - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::NewLogger(const std::string& fname, - std::shared_ptr* result) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::NewLogger(const std::string& fname, + const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); MemFile* file = nullptr; if (iter == file_map_.end()) { - file = new MemFile(this, fn, false); + file = new MemFile(env_, fn, false); file->Ref(); file_map_[fn] = file; } else { file = iter->second; } - std::unique_ptr f(new MockWritableFile(file, nullptr)); - result->reset(new TestMemLogger(std::move(f), this)); - return Status::OK(); + std::unique_ptr f(new MockWritableFile(file, FileOptions())); + result->reset(new TestMemLogger(std::move(f), env_, io_opts, dbg)); + return IOStatus::OK(); } -Status MockEnv::LockFile(const std::string& fname, FileLock** flock) { - auto fn = NormalizePath(fname); +IOStatus MockFileSystem::LockFile(const std::string& fname, + const IOOptions& /*options*/, + FileLock** flock, 
IODebugContext* /*dbg*/) { + auto fn = NormalizeMockPath(fname); { MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { if (!file_map_[fn]->is_lock_file()) { - return Status::InvalidArgument(fname, "Not a lock file."); + return IOStatus::InvalidArgument(fname, "Not a lock file."); } if (!file_map_[fn]->Lock()) { - return Status::IOError(fn, "Lock is already held."); + return IOStatus::IOError(fn, "lock is already held."); } } else { - auto* file = new MemFile(this, fn, true); + auto* file = new MemFile(env_, fn, true); file->Ref(); file->Lock(); file_map_[fn] = file; } } *flock = new MockEnvFileLock(fn); - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::UnlockFile(FileLock* flock) { +IOStatus MockFileSystem::UnlockFile(FileLock* flock, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { std::string fn = static_cast_with_check(flock)->FileName(); { MutexLock lock(&mutex_); if (file_map_.find(fn) != file_map_.end()) { if (!file_map_[fn]->is_lock_file()) { - return Status::InvalidArgument(fn, "Not a lock file."); + return IOStatus::InvalidArgument(fn, "Not a lock file."); } file_map_[fn]->Unlock(); } } delete flock; - return Status::OK(); + return IOStatus::OK(); } -Status MockEnv::GetTestDirectory(std::string* path) { +IOStatus MockFileSystem::GetTestDirectory(const IOOptions& /*options*/, + std::string* path, + IODebugContext* /*dbg*/) { *path = "/test"; - return Status::OK(); -} - -Status MockEnv::GetCurrentTime(int64_t* unix_time) { - auto s = EnvWrapper::GetCurrentTime(unix_time); - if (s.ok()) { - *unix_time += fake_sleep_micros_.load() / (1000 * 1000); - } - return s; + return IOStatus::OK(); } -uint64_t MockEnv::NowMicros() { - return EnvWrapper::NowMicros() + fake_sleep_micros_.load(); -} - -uint64_t MockEnv::NowNanos() { - return EnvWrapper::NowNanos() + fake_sleep_micros_.load() * 1000; -} - -Status MockEnv::CorruptBuffer(const std::string& fname) { - auto fn = NormalizePath(fname); +Status MockFileSystem::CorruptBuffer(const std::string& fname) { + auto fn = NormalizeMockPath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); if (iter == file_map_.end()) { @@ -748,9 +1034,52 @@ Status MockEnv::CorruptBuffer(const std::string& fname) { iter->second->CorruptBuffer(); return Status::OK(); } +namespace { +class MockSystemClock : public SystemClockWrapper { + public: + explicit MockSystemClock(const std::shared_ptr& c) + : SystemClockWrapper(c), fake_sleep_micros_(0) {} + + void FakeSleepForMicroseconds(int64_t micros) { + fake_sleep_micros_.fetch_add(micros); + } + + const char* Name() const override { return "MockSystemClock"; } + + Status GetCurrentTime(int64_t* unix_time) override { + auto s = SystemClockWrapper::GetCurrentTime(unix_time); + if (s.ok()) { + auto fake_time = fake_sleep_micros_.load() / (1000 * 1000); + *unix_time += fake_time; + } + return s; + } + + uint64_t NowMicros() override { + return SystemClockWrapper::NowMicros() + fake_sleep_micros_.load(); + } + + uint64_t NowNanos() override { + return SystemClockWrapper::NowNanos() + fake_sleep_micros_.load() * 1000; + } + + private: + std::atomic fake_sleep_micros_; +}; +} // namespace +MockEnv::MockEnv(Env* base_env) + : CompositeEnvWrapper( + base_env, std::make_shared(this), + std::make_shared(base_env->GetSystemClock())) {} + +Status MockEnv::CorruptBuffer(const std::string& fname) { + auto mock = static_cast_with_check(GetFileSystem().get()); + return mock->CorruptBuffer(fname); +} void MockEnv::FakeSleepForMicroseconds(int64_t micros) { - 
fake_sleep_micros_.fetch_add(micros); + auto mock = static_cast_with_check(GetSystemClock().get()); + mock->FakeSleepForMicroseconds(micros); } #ifndef ROCKSDB_LITE diff --git a/env/mock_env.h b/env/mock_env.h index 1ed5c0b1f73..5e7faf55b85 100644 --- a/env/mock_env.h +++ b/env/mock_env.h @@ -12,93 +12,17 @@ #include #include #include + +#include "env/composite_env_wrapper.h" #include "rocksdb/env.h" #include "rocksdb/status.h" -#include "port/port.h" -#include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -class MemFile; -class MockEnv : public EnvWrapper { +class MockEnv : public CompositeEnvWrapper { public: explicit MockEnv(Env* base_env); - ~MockEnv() override; - - // Partial implementation of the Env interface. - Status RegisterDbPaths(const std::vector& /*paths*/) override { - return Status::OK(); - } - - Status UnregisterDbPaths(const std::vector& /*paths*/) override { - return Status::OK(); - } - - Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override; - - Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& env_options) override; - - Status NewDirectory(const std::string& name, - std::unique_ptr* result) override; - - Status FileExists(const std::string& fname) override; - - Status GetChildren(const std::string& dir, - std::vector* result) override; - - void DeleteFileInternal(const std::string& fname); - - Status DeleteFile(const std::string& fname) override; - - Status Truncate(const std::string& fname, size_t size) override; - - Status CreateDir(const std::string& dirname) override; - - Status CreateDirIfMissing(const std::string& dirname) override; - - Status DeleteDir(const std::string& dirname) override; - - Status GetFileSize(const std::string& fname, uint64_t* file_size) override; - - Status GetFileModificationTime(const std::string& fname, - uint64_t* time) override; - - Status RenameFile(const std::string& src, const std::string& target) override; - - Status LinkFile(const std::string& src, const std::string& target) override; - - Status NewLogger(const std::string& fname, - std::shared_ptr* result) override; - - Status LockFile(const std::string& fname, FileLock** flock) override; - - Status UnlockFile(FileLock* flock) override; - - Status GetTestDirectory(std::string* path) override; - - // Results of these can be affected by FakeSleepForMicroseconds() - Status GetCurrentTime(int64_t* unix_time) override; - uint64_t NowMicros() override; - uint64_t NowNanos() override; - Status CorruptBuffer(const std::string& fname); // Doesn't really sleep, just affects output of GetCurrentTime(), NowMicros() @@ -106,12 +30,6 @@ class MockEnv : public EnvWrapper { void FakeSleepForMicroseconds(int64_t micros); private: - // Map from filenames to MemFile objects, representing a simple file system. - typedef std::map FileSystem; - port::Mutex mutex_; - FileSystem file_map_; // Protected by mutex_. 
- - std::atomic fake_sleep_micros_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/examples/Makefile b/examples/Makefile index 27a6f0f421a..faee6f06bfd 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -12,6 +12,8 @@ ifneq ($(USE_RTTI), 1) CXXFLAGS += -fno-rtti endif +CFLAGS += -Wstrict-prototypes + .PHONY: clean librocksdb all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example compaction_filter_example options_file_example diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index 5a032837e88..5f9994074a6 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -15,17 +15,19 @@ #include "logging/logging.h" #include "port/port.h" #include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { -DeleteScheduler::DeleteScheduler(Env* env, FileSystem* fs, +DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs, int64_t rate_bytes_per_sec, Logger* info_log, SstFileManagerImpl* sst_file_manager, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) - : env_(env), + : clock_(clock), fs_(fs), total_trash_size_(0), rate_bytes_per_sec_(rate_bytes_per_sec), @@ -51,19 +53,21 @@ DeleteScheduler::~DeleteScheduler() { if (bg_thread_) { bg_thread_->join(); } + for (const auto& it : bg_errors_) { + it.second.PermitUncheckedError(); + } } Status DeleteScheduler::DeleteFile(const std::string& file_path, const std::string& dir_to_sync, const bool force_bg) { - Status s; if (rate_bytes_per_sec_.load() <= 0 || (!force_bg && total_trash_size_.load() > sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) { // Rate limiting is disabled or trash size makes up more than // max_trash_db_ratio_ (default 25%) of the total DB size TEST_SYNC_POINT("DeleteScheduler::DeleteFile"); - s = fs_->DeleteFile(file_path, IOOptions(), nullptr); + Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr); if (s.ok()) { s = sst_file_manager_->OnDeleteFile(file_path); ROCKS_LOG_INFO(info_log_, @@ -79,7 +83,7 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path, // Move file to trash std::string trash_file; - s = MarkAsTrash(file_path, &trash_file); + Status s = MarkAsTrash(file_path, &trash_file); ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(), s.ToString().c_str()); @@ -99,8 +103,13 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path, // Update the total trash size uint64_t trash_file_size = 0; - fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr); - total_trash_size_.fetch_add(trash_file_size); + IOStatus io_s = + fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr); + if (io_s.ok()) { + total_trash_size_.fetch_add(trash_file_size); + } + //**TODO: What should we do if we failed to + // get the file size? // Add file to delete queue { @@ -169,17 +178,17 @@ Status DeleteScheduler::MarkAsTrash(const std::string& file_path, return Status::InvalidArgument("file_path is corrupted"); } - Status s; if (DeleteScheduler::IsTrashFile(file_path)) { // This is already a trash file *trash_file = file_path; - return s; + return Status::OK(); } *trash_file = file_path + kTrashExtension; // TODO(tec) : Implement Env::RenameFileIfNotExist and remove // file_move_mu mutex. 
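// A minimal sketch (illustration only, not part of the patch): the branch
// DeleteScheduler::DeleteFile() takes between deleting a file immediately and
// renaming it to "<file>.trash" for rate-limited background deletion. The
// parameter names mirror the members used above; the helper itself is
// hypothetical and only restates the decision for clarity.
bool ShouldDeleteImmediately(int64_t rate_bytes_per_sec, bool force_bg,
                             uint64_t total_trash_size, uint64_t total_db_size,
                             double max_trash_db_ratio) {
  // Rate limiting disabled -> delete right away.
  if (rate_bytes_per_sec <= 0) {
    return true;
  }
  // Trash already exceeds the allowed fraction of the DB -> delete right
  // away, unless the caller forces background deletion.
  if (!force_bg && total_trash_size > total_db_size * max_trash_db_ratio) {
    return true;
  }
  // Otherwise mark as trash and let the background thread delete it at
  // rate_bytes_per_sec.
  return false;
}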
int cnt = 0; + Status s; InstrumentedMutexLock l(&file_move_mu_); while (true) { s = fs_->FileExists(*trash_file, IOOptions(), nullptr); @@ -197,7 +206,7 @@ Status DeleteScheduler::MarkAsTrash(const std::string& file_path, cnt++; } if (s.ok()) { - sst_file_manager_->OnMoveFile(file_path, *trash_file); + s = sst_file_manager_->OnMoveFile(file_path, *trash_file); } return s; } @@ -216,14 +225,14 @@ void DeleteScheduler::BackgroundEmptyTrash() { } // Delete all files in queue_ - uint64_t start_time = env_->NowMicros(); + uint64_t start_time = clock_->NowMicros(); uint64_t total_deleted_bytes = 0; int64_t current_delete_rate = rate_bytes_per_sec_.load(); while (!queue_.empty() && !closing_) { if (current_delete_rate != rate_bytes_per_sec_.load()) { // User changed the delete rate current_delete_rate = rate_bytes_per_sec_.load(); - start_time = env_->NowMicros(); + start_time = clock_->NowMicros(); total_deleted_bytes = 0; ROCKS_LOG_INFO(info_log_, "rate_bytes_per_sec is changed to %" PRIi64, current_delete_rate); diff --git a/file/delete_scheduler.h b/file/delete_scheduler.h index b2d17a73e12..6d3f6b4a4f4 100644 --- a/file/delete_scheduler.h +++ b/file/delete_scheduler.h @@ -15,26 +15,28 @@ #include "monitoring/instrumented_mutex.h" #include "port/port.h" -#include "rocksdb/file_system.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { class Env; +class FileSystem; class Logger; class SstFileManagerImpl; +class SystemClock; // DeleteScheduler allows the DB to enforce a rate limit on file deletion, // Instead of deleteing files immediately, files are marked as trash -// and deleted in a background thread that apply sleep penlty between deletes +// and deleted in a background thread that apply sleep penalty between deletes // if they are happening in a rate faster than rate_bytes_per_sec, // // Rate limiting can be turned off by setting rate_bytes_per_sec = 0, In this // case DeleteScheduler will delete files immediately. class DeleteScheduler { public: - DeleteScheduler(Env* env, FileSystem* fs, int64_t rate_bytes_per_sec, - Logger* info_log, SstFileManagerImpl* sst_file_manager, + DeleteScheduler(SystemClock* clock, FileSystem* fs, + int64_t rate_bytes_per_sec, Logger* info_log, + SstFileManagerImpl* sst_file_manager, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk); ~DeleteScheduler(); @@ -48,7 +50,7 @@ class DeleteScheduler { MaybeCreateBackgroundThread(); } - // Mark file as trash directory and schedule it's deletion. If force_bg is + // Mark file as trash directory and schedule its deletion. 
If force_bg is // set, it forces the file to always be deleted in the background thread, // except when rate limiting is disabled Status DeleteFile(const std::string& fname, const std::string& dir_to_sync, @@ -78,7 +80,7 @@ class DeleteScheduler { static const std::string kTrashExtension; static bool IsTrashFile(const std::string& file_path); - // Check if there are any .trash filse in path, and schedule their deletion + // Check if there are any .trash files in path, and schedule their deletion // Or delete immediately if sst_file_manager is nullptr static Status CleanupDirectory(Env* env, SstFileManagerImpl* sfm, const std::string& path); @@ -99,7 +101,7 @@ class DeleteScheduler { void MaybeCreateBackgroundThread(); - Env* env_; + SystemClock* clock_; FileSystem* fs_; // total size of trash files diff --git a/file/delete_scheduler_test.cc b/file/delete_scheduler_test.cc index 67eaa50e613..e6f590a526b 100644 --- a/file/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -10,7 +10,6 @@ #include #include -#include "env/composite_env_wrapper.h" #include "file/file_util.h" #include "file/sst_file_manager_impl.h" #include "rocksdb/env.h" @@ -58,7 +57,7 @@ class DeleteSchedulerTest : public testing::Test { int normal_cnt = 0; for (auto& f : files_in_dir) { - if (!DeleteScheduler::IsTrashFile(f) && f != "." && f != "..") { + if (!DeleteScheduler::IsTrashFile(f)) { normal_cnt++; } } @@ -88,7 +87,7 @@ class DeleteSchedulerTest : public testing::Test { std::string data(size, 'A'); EXPECT_OK(f->Append(data)); EXPECT_OK(f->Close()); - sst_file_mgr_->OnAddFile(file_path, false); + sst_file_mgr_->OnAddFile(file_path); return file_path; } @@ -96,10 +95,9 @@ class DeleteSchedulerTest : public testing::Test { // Tests in this file are for DeleteScheduler component and don't create any // DBs, so we need to set max_trash_db_ratio to 100% (instead of default // 25%) - std::shared_ptr - fs(std::make_shared(env_)); sst_file_mgr_.reset( - new SstFileManagerImpl(env_, fs, nullptr, rate_bytes_per_sec_, + new SstFileManagerImpl(env_->GetSystemClock(), env_->GetFileSystem(), + nullptr, rate_bytes_per_sec_, /* max_trash_db_ratio= */ 1.1, 128 * 1024)); delete_scheduler_ = sst_file_mgr_->delete_scheduler(); sst_file_mgr_->SetStatisticsPtr(stats_); @@ -426,7 +424,9 @@ TEST_F(DeleteSchedulerTest, BackgroundError) { delete_scheduler_->WaitForEmptyTrash(); auto bg_errors = delete_scheduler_->GetBackgroundErrors(); ASSERT_EQ(bg_errors.size(), 10); - + for (const auto& it : bg_errors) { + ASSERT_TRUE(it.second.IsPathNotFound()); + } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -670,7 +670,7 @@ TEST_F(DeleteSchedulerTest, ImmediateDeleteOn25PercDBSize) { } for (std::string& file_name : generated_files) { - delete_scheduler_->DeleteFile(file_name, ""); + ASSERT_OK(delete_scheduler_->DeleteFile(file_name, "")); } // When we end up with 26 files in trash we will start diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc index 8d9798d09eb..1fe5a367ee0 100644 --- a/file/file_prefetch_buffer.cc +++ b/file/file_prefetch_buffer.cc @@ -91,23 +91,26 @@ Status FilePrefetchBuffer::Prefetch(const IOOptions& opts, size_t read_len = static_cast(roundup_len - chunk_len); s = reader->Read(opts, rounddown_offset + chunk_len, read_len, &result, buffer_.BufferStart() + chunk_len, nullptr, for_compaction); + if (!s.ok()) { + return s; + } + #ifndef NDEBUG - if (!s.ok() || result.size() < read_len) { + if (result.size() < read_len) { // Fake an IO error to force db_stress fault injection 
to ignore // truncated read errors IGNORE_STATUS_IF_ERROR(Status::IOError()); } #endif - if (s.ok()) { - buffer_offset_ = rounddown_offset; - buffer_.Size(static_cast(chunk_len) + result.size()); - } + buffer_offset_ = rounddown_offset; + buffer_.Size(static_cast(chunk_len) + result.size()); return s; } bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, uint64_t offset, size_t n, - Slice* result, bool for_compaction) { + Slice* result, Status* status, + bool for_compaction) { if (track_min_offset_ && offset < min_offset_read_) { min_offset_read_ = static_cast(offset); } @@ -116,7 +119,7 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, } // If the buffer contains only a few of the requested bytes: - // If readahead is enabled: prefetch the remaining bytes + readadhead bytes + // If readahead is enabled: prefetch the remaining bytes + readahead bytes // and satisfy the request. // If readahead is not enabled: return false. if (offset + n > buffer_offset_ + buffer_.CurrentSize()) { @@ -128,10 +131,34 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, s = Prefetch(opts, file_reader_, offset, std::max(n, readahead_size_), for_compaction); } else { + if (implicit_auto_readahead_) { + // Prefetch only if this read is sequential otherwise reset + // readahead_size_ to initial value. + if (!IsBlockSequential(offset)) { + UpdateReadPattern(offset, n); + ResetValues(); + // Ignore status as Prefetch is not called. + s.PermitUncheckedError(); + return false; + } + num_file_reads_++; + if (num_file_reads_ <= kMinNumFileReadsToStartAutoReadahead) { + UpdateReadPattern(offset, n); + // Ignore status as Prefetch is not called. + s.PermitUncheckedError(); + return false; + } + } s = Prefetch(opts, file_reader_, offset, n + readahead_size_, for_compaction); } if (!s.ok()) { + if (status) { + *status = s; + } +#ifndef NDEBUG + IGNORE_STATUS_IF_ERROR(s); +#endif return false; } readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2); @@ -139,7 +166,7 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, return false; } } - + UpdateReadPattern(offset, n); uint64_t offset_in_buffer = offset - buffer_offset_; *result = Slice(buffer_.BufferStart() + offset_in_buffer, n); return true; diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index d8e9c0ff6ea..980496d285e 100644 --- a/file/file_prefetch_buffer.h +++ b/file/file_prefetch_buffer.h @@ -23,6 +23,7 @@ namespace ROCKSDB_NAMESPACE { // FilePrefetchBuffer is a smart buffer to store and read data from a file. class FilePrefetchBuffer { public: + static const int kMinNumFileReadsToStartAutoReadahead = 2; // Constructor. // // All arguments are optional. @@ -38,23 +39,31 @@ class FilePrefetchBuffer { // for the minimum offset if track_min_offset = true. // track_min_offset : Track the minimum offset ever read and collect stats on // it. Used for adaptable readahead of the file footer/metadata. + // implicit_auto_readahead : Readahead is enabled implicitly by rocksdb after + // doing sequential scans for two times. // // Automatic readhead is enabled for a file if file_reader, readahead_size, // and max_readahead_size are passed in. - // If file_reader is a nullptr, setting readadhead_size and max_readahead_size + // If file_reader is a nullptr, setting readahead_size and max_readahead_size // does not make any sense. So it does nothing. // A user can construct a FilePrefetchBuffer without any arguments, but use // `Prefetch` to load data into the buffer. 
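// A minimal sketch (illustration only, not part of the patch): driving the
// updated TryReadFromCache() signature, which now reports the prefetch status
// through an out-parameter instead of swallowing it. `reader`, `offset`, and
// `len` are assumed to come from the caller; the readahead sizes are
// placeholders.
Status ReadWithPrefetch(RandomAccessFileReader* reader, uint64_t offset,
                        size_t len, Slice* result) {
  // Readahead starts at 8 KB, may grow up to 256 KB, and is only armed after
  // two sequential reads because implicit_auto_readahead is true.
  FilePrefetchBuffer prefetch_buffer(reader, /*readahead_size=*/8 * 1024,
                                     /*max_readahead_size=*/256 * 1024,
                                     /*enable=*/true,
                                     /*track_min_offset=*/false,
                                     /*implicit_auto_readahead=*/true);
  Status s;
  if (!prefetch_buffer.TryReadFromCache(IOOptions(), offset, len, result,
                                        &s)) {
    // false means either "not in buffer" (s is OK) or a real prefetch
    // failure (s carries the I/O error); the caller decides how to fall back.
    return s.ok() ? Status::Incomplete("not prefetched") : s;
  }
  return Status::OK();
}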
FilePrefetchBuffer(RandomAccessFileReader* file_reader = nullptr, - size_t readadhead_size = 0, size_t max_readahead_size = 0, - bool enable = true, bool track_min_offset = false) + size_t readahead_size = 0, size_t max_readahead_size = 0, + bool enable = true, bool track_min_offset = false, + bool implicit_auto_readahead = false) : buffer_offset_(0), file_reader_(file_reader), - readahead_size_(readadhead_size), + readahead_size_(readahead_size), max_readahead_size_(max_readahead_size), + initial_readahead_size_(readahead_size), min_offset_read_(port::kMaxSizet), enable_(enable), - track_min_offset_(track_min_offset) {} + track_min_offset_(track_min_offset), + implicit_auto_readahead_(implicit_auto_readahead), + prev_offset_(0), + prev_len_(0), + num_file_reads_(kMinNumFileReadsToStartAutoReadahead + 1) {} // Load data into the buffer from a file. // reader : the file reader. @@ -67,7 +76,7 @@ class FilePrefetchBuffer { // Tries returning the data for a file raed from this buffer, if that data is // in the buffer. // It handles tracking the minimum read offset if track_min_offset = true. - // It also does the exponential readahead when readadhead_size is set as part + // It also does the exponential readahead when readahead_size is set as part // of the constructor. // // offset : the file offset. @@ -75,18 +84,33 @@ class FilePrefetchBuffer { // result : output buffer to put the data into. // for_compaction : if cache read is done for compaction read. bool TryReadFromCache(const IOOptions& opts, uint64_t offset, size_t n, - Slice* result, bool for_compaction = false); + Slice* result, Status* s, bool for_compaction = false); // The minimum `offset` ever passed to TryReadFromCache(). This will nly be // tracked if track_min_offset = true. size_t min_offset_read() const { return min_offset_read_; } + void UpdateReadPattern(const size_t& offset, const size_t& len) { + prev_offset_ = offset; + prev_len_ = len; + } + + bool IsBlockSequential(const size_t& offset) { + return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset)); + } + + void ResetValues() { + num_file_reads_ = 1; + readahead_size_ = initial_readahead_size_; + } + private: AlignedBuffer buffer_; uint64_t buffer_offset_; RandomAccessFileReader* file_reader_; size_t readahead_size_; size_t max_readahead_size_; + size_t initial_readahead_size_; // The minimum `offset` ever passed to TryReadFromCache(). size_t min_offset_read_; // if false, TryReadFromCache() always return false, and we only take stats @@ -95,5 +119,12 @@ class FilePrefetchBuffer { // If true, track minimum `offset` ever passed to TryReadFromCache(), which // can be fetched from min_offset_read(). bool track_min_offset_; + + // implicit_auto_readahead is enabled by rocksdb internally after 2 sequential + // IOs. + bool implicit_auto_readahead_; + size_t prev_offset_; + size_t prev_len_; + int num_file_reads_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/file_util.cc b/file/file_util.cc index 69f357a1ad4..70191a7f377 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -124,15 +124,17 @@ bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options) { } // requested_checksum_func_name brings the function name of the checksum -// generator in checksum_factory. Checksum factories may use or ignore -// requested_checksum_func_name. +// generator in checksum_factory. Empty string is permitted, in which case the +// name of the generator created by the factory is unchecked. 
When +// `requested_checksum_func_name` is non-empty, however, the created generator's +// name must match it, otherwise an `InvalidArgument` error is returned. IOStatus GenerateOneFileChecksum( FileSystem* fs, const std::string& file_path, FileChecksumGenFactory* checksum_factory, const std::string& requested_checksum_func_name, std::string* file_checksum, std::string* file_checksum_func_name, size_t verify_checksums_readahead_size, bool allow_mmap_reads, - std::shared_ptr& io_tracer) { + std::shared_ptr& io_tracer, RateLimiter* rate_limiter) { if (checksum_factory == nullptr) { return IOStatus::InvalidArgument("Checksum factory is invalid"); } @@ -151,14 +153,22 @@ IOStatus GenerateOneFileChecksum( requested_checksum_func_name + " from checksum factory: " + checksum_factory->Name(); return IOStatus::InvalidArgument(msg); + } else { + // For backward compatibility and use in file ingestion clients where there + // is no stored checksum function name, `requested_checksum_func_name` can + // be empty. If we give the requested checksum function name, we expect it + // is the same name of the checksum generator. + if (!requested_checksum_func_name.empty() && + checksum_generator->Name() != requested_checksum_func_name) { + std::string msg = "Expected file checksum generator named '" + + requested_checksum_func_name + + "', while the factory created one " + "named '" + + checksum_generator->Name() + "'"; + return IOStatus::InvalidArgument(msg); + } } - // For backward compatable, requested_checksum_func_name can be empty. - // If we give the requested checksum function name, we expect it is the - // same name of the checksum generator. - assert(!checksum_generator || requested_checksum_func_name.empty() || - requested_checksum_func_name == checksum_generator->Name()); - uint64_t size; IOStatus io_s; std::unique_ptr reader; @@ -173,7 +183,8 @@ IOStatus GenerateOneFileChecksum( return io_s; } reader.reset(new RandomAccessFileReader(std::move(r_file), file_path, - nullptr /*Env*/, io_tracer)); + nullptr /*Env*/, io_tracer, nullptr, + 0, nullptr, rate_limiter)); } // Found that 256 KB readahead size provides the best performance, based on @@ -184,7 +195,7 @@ IOStatus GenerateOneFileChecksum( : default_max_read_ahead_size; FilePrefetchBuffer prefetch_buffer( - reader.get(), readahead_size /* readadhead_size */, + reader.get(), readahead_size /* readahead_size */, readahead_size /* max_readahead_size */, !allow_mmap_reads /* enable */); Slice slice; @@ -194,7 +205,7 @@ IOStatus GenerateOneFileChecksum( size_t bytes_to_read = static_cast(std::min(uint64_t{readahead_size}, size)); if (!prefetch_buffer.TryReadFromCache(opts, offset, bytes_to_read, &slice, - false)) { + nullptr, false)) { return IOStatus::Corruption("file read failed"); } if (slice.size() == 0) { @@ -219,9 +230,6 @@ Status DestroyDir(Env* env, const std::string& dir) { s = env->GetChildren(dir, &files_in_dir); if (s.ok()) { for (auto& file_in_dir : files_in_dir) { - if (file_in_dir == "." || file_in_dir == "..") { - continue; - } std::string path = dir + "/" + file_in_dir; bool is_dir = false; s = env->IsDirectory(path, &is_dir); @@ -231,6 +239,8 @@ Status DestroyDir(Env* env, const std::string& dir) { } else { s = env->DeleteFile(path); } + } else if (s.IsNotSupported()) { + s = Status::OK(); } if (!s.ok()) { // IsDirectory, etc. 
might not report NotFound diff --git a/file/file_util.h b/file/file_util.h index a9b0a95095d..48878833f41 100644 --- a/file/file_util.h +++ b/file/file_util.h @@ -12,6 +12,7 @@ #include "rocksdb/file_system.h" #include "rocksdb/sst_file_writer.h" #include "rocksdb/status.h" +#include "rocksdb/system_clock.h" #include "rocksdb/types.h" #include "trace_replay/io_tracer.h" @@ -22,10 +23,23 @@ extern IOStatus CopyFile(FileSystem* fs, const std::string& source, const std::string& destination, uint64_t size, bool use_fsync, const std::shared_ptr& io_tracer = nullptr); +inline IOStatus CopyFile(const std::shared_ptr& fs, + const std::string& source, + const std::string& destination, uint64_t size, + bool use_fsync, + const std::shared_ptr& io_tracer = nullptr) { + return CopyFile(fs.get(), source, destination, size, use_fsync, io_tracer); +} extern IOStatus CreateFile(FileSystem* fs, const std::string& destination, const std::string& contents, bool use_fsync); +inline IOStatus CreateFile(const std::shared_ptr& fs, + const std::string& destination, + const std::string& contents, bool use_fsync) { + return CreateFile(fs.get(), destination, contents, use_fsync); +} + extern Status DeleteDBFile(const ImmutableDBOptions* db_options, const std::string& fname, const std::string& path_to_sync, const bool force_bg, @@ -39,16 +53,26 @@ extern IOStatus GenerateOneFileChecksum( const std::string& requested_checksum_func_name, std::string* file_checksum, std::string* file_checksum_func_name, size_t verify_checksums_readahead_size, bool allow_mmap_reads, - std::shared_ptr& io_tracer); + std::shared_ptr& io_tracer, RateLimiter* rate_limiter = nullptr); -inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, Env* env, - IOOptions& opts) { - if (!env) { - env = Env::Default(); - } +inline IOStatus GenerateOneFileChecksum( + const std::shared_ptr& fs, const std::string& file_path, + FileChecksumGenFactory* checksum_factory, + const std::string& requested_checksum_func_name, std::string* file_checksum, + std::string* file_checksum_func_name, + size_t verify_checksums_readahead_size, bool allow_mmap_reads, + std::shared_ptr& io_tracer) { + return GenerateOneFileChecksum( + fs.get(), file_path, checksum_factory, requested_checksum_func_name, + file_checksum, file_checksum_func_name, verify_checksums_readahead_size, + allow_mmap_reads, io_tracer); +} +inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, + SystemClock* clock, IOOptions& opts) { if (ro.deadline.count()) { - std::chrono::microseconds now = std::chrono::microseconds(env->NowMicros()); + std::chrono::microseconds now = + std::chrono::microseconds(clock->NowMicros()); // Ensure there is atleast 1us available. 
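// A minimal sketch (illustration only, not part of the patch): converting a
// user-level ReadOptions deadline into an IOOptions timeout before issuing a
// read, using the SystemClock-based PrepareIOFromReadOptions() above. The
// reader, clock, and buffer arguments are assumed to come from the caller.
IOStatus ReadWithDeadline(RandomAccessFileReader* reader, SystemClock* clock,
                          const ReadOptions& ro, uint64_t offset, size_t n,
                          Slice* result, char* scratch) {
  IOOptions opts;
  IOStatus io_s = PrepareIOFromReadOptions(ro, clock, opts);
  if (!io_s.ok()) {
    // A deadline that has already passed surfaces here instead of deep
    // inside the file system.
    return io_s;
  }
  return reader->Read(opts, offset, n, result, scratch,
                      /*aligned_buf=*/nullptr);
}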
We don't want to pass a value of // 0 as that means no timeout if (now >= ro.deadline) { diff --git a/file/filename.cc b/file/filename.cc index a7c22d2e773..87bf060d1c2 100644 --- a/file/filename.cc +++ b/file/filename.cc @@ -13,7 +13,6 @@ #include #include #include "file/writable_file_writer.h" -#include "logging/logging.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" #include "util/stop_watch.h" @@ -184,7 +183,8 @@ InfoLogPrefix::InfoLogPrefix(bool has_log_dir, snprintf(buf, sizeof(buf), kInfoLogPrefix); prefix = Slice(buf, sizeof(kInfoLogPrefix) - 1); } else { - size_t len = GetInfoLogPrefix(db_absolute_path, buf, sizeof(buf)); + size_t len = + GetInfoLogPrefix(NormalizePath(db_absolute_path), buf, sizeof(buf)); prefix = Slice(buf, len); } } @@ -352,7 +352,7 @@ bool ParseFileName(const std::string& fname, uint64_t* number, Slice suffix = rest; if (suffix == Slice("log")) { - *type = kLogFile; + *type = kWalFile; if (log_type && !archive_dir_found) { *log_type = kAliveLogFile; } @@ -383,10 +383,12 @@ IOStatus SetCurrentFile(FileSystem* fs, const std::string& dbname, contents.remove_prefix(dbname.size() + 1); std::string tmp = TempFileName(dbname, descriptor_number); IOStatus s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true); + TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s); if (s.ok()) { - TEST_KILL_RANDOM("SetCurrentFile:0", rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:0", REDUCE_ODDS2); s = fs->RenameFile(tmp, CurrentFileName(dbname), IOOptions(), nullptr); - TEST_KILL_RANDOM("SetCurrentFile:1", rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("SetCurrentFile:1", REDUCE_ODDS2); + TEST_SYNC_POINT_CALLBACK("SetCurrentFile:AfterRename", &s); } if (s.ok()) { if (directory_to_fsync != nullptr) { @@ -419,20 +421,21 @@ Status SetIdentityFile(Env* env, const std::string& dbname, return s; } -IOStatus SyncManifest(Env* env, const ImmutableDBOptions* db_options, +IOStatus SyncManifest(const ImmutableDBOptions* db_options, WritableFileWriter* file) { - TEST_KILL_RANDOM("SyncManifest:0", rocksdb_kill_odds * REDUCE_ODDS2); - StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS); + TEST_KILL_RANDOM_WITH_WEIGHT("SyncManifest:0", REDUCE_ODDS2); + StopWatch sw(db_options->clock, db_options->stats, MANIFEST_FILE_SYNC_MICROS); return file->Sync(db_options->use_fsync); } -Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, - const std::string& dbname, std::string* parent_dir, +Status GetInfoLogFiles(const std::shared_ptr& fs, + const std::string& db_log_dir, const std::string& dbname, + std::string* parent_dir, std::vector* info_log_list) { assert(parent_dir != nullptr); assert(info_log_list != nullptr); uint64_t number = 0; - FileType type = kLogFile; + FileType type = kWalFile; if (!db_log_dir.empty()) { *parent_dir = db_log_dir; @@ -443,7 +446,7 @@ Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, InfoLogPrefix info_log_prefix(!db_log_dir.empty(), dbname); std::vector file_names; - Status s = env->GetChildren(*parent_dir, &file_names); + Status s = fs->GetChildren(*parent_dir, IOOptions(), &file_names, nullptr); if (!s.ok()) { return s; diff --git a/file/filename.h b/file/filename.h index f23723244c7..7f34ade281b 100644 --- a/file/filename.h +++ b/file/filename.h @@ -27,6 +27,7 @@ namespace ROCKSDB_NAMESPACE { class Env; class Directory; +class SystemClock; class WritableFileWriter; #ifdef OS_WIN @@ -35,20 +36,6 @@ const char kFilePathSeparator = '\\'; const 
char kFilePathSeparator = '/'; #endif -enum FileType { - kLogFile, - kDBLockFile, - kTableFile, - kDescriptorFile, - kCurrentFile, - kTempFile, - kInfoLogFile, // Either the current one, or an old one - kMetaDatabase, - kIdentityFile, - kOptionsFile, - kBlobFile -}; - // Return the name of the log file with the specified number // in the db named by "dbname". The result will be prefixed with // "dbname". @@ -180,14 +167,15 @@ extern Status SetIdentityFile(Env* env, const std::string& dbname, const std::string& db_id = {}); // Sync manifest file `file`. -extern IOStatus SyncManifest(Env* env, const ImmutableDBOptions* db_options, +extern IOStatus SyncManifest(const ImmutableDBOptions* db_options, WritableFileWriter* file); // Return list of file names of info logs in `file_names`. // The list only contains file name. The parent directory name is stored // in `parent_dir`. // `db_log_dir` should be the one as in options.db_log_dir -extern Status GetInfoLogFiles(Env* env, const std::string& db_log_dir, +extern Status GetInfoLogFiles(const std::shared_ptr& fs, + const std::string& db_log_dir, const std::string& dbname, std::string* parent_dir, std::vector* file_names); diff --git a/file/line_file_reader.cc b/file/line_file_reader.cc new file mode 100644 index 00000000000..8a56a09b2e4 --- /dev/null +++ b/file/line_file_reader.cc @@ -0,0 +1,65 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "file/line_file_reader.h" + +#include + +namespace ROCKSDB_NAMESPACE { + +Status LineFileReader::Create(const std::shared_ptr& fs, + const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg) { + std::unique_ptr file; + Status s = fs->NewSequentialFile(fname, file_opts, &file, dbg); + if (s.ok()) { + reader->reset(new LineFileReader(std::move(file), fname)); + } + return s; +} + +bool LineFileReader::ReadLine(std::string* out) { + assert(out); + if (!status_.ok()) { + // Status should be checked (or permit unchecked) any time we return false. + status_.MustCheck(); + return false; + } + out->clear(); + for (;;) { + // Look for line delimiter + const char* found = static_cast( + std::memchr(buf_begin_, '\n', buf_end_ - buf_begin_)); + if (found) { + size_t len = found - buf_begin_; + out->append(buf_begin_, len); + buf_begin_ += len + /*delim*/ 1; + ++line_number_; + return true; + } + if (at_eof_) { + status_.MustCheck(); + return false; + } + // else flush and reload buffer + out->append(buf_begin_, buf_end_ - buf_begin_); + Slice result; + status_ = sfr_.Read(buf_.size(), &result, buf_.data()); + if (!status_.ok()) { + status_.MustCheck(); + return false; + } + if (result.size() != buf_.size()) { + // The obscure way of indicating EOF + at_eof_ = true; + } + buf_begin_ = result.data(); + buf_end_ = result.data() + result.size(); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/file/line_file_reader.h b/file/line_file_reader.h new file mode 100644 index 00000000000..48d79f327c0 --- /dev/null +++ b/file/line_file_reader.h @@ -0,0 +1,59 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once +#include + +#include "file/sequence_file_reader.h" + +namespace ROCKSDB_NAMESPACE { + +// A wrapper on top of Env::SequentialFile for reading text lines from a file. +// Lines are delimited by '\n'. The last line may or may not include a +// trailing newline. Uses SequentialFileReader internally. +class LineFileReader { + private: + std::array buf_; + SequentialFileReader sfr_; + Status status_; + const char* buf_begin_ = buf_.data(); + const char* buf_end_ = buf_.data(); + size_t line_number_ = 0; + bool at_eof_ = false; + + public: + // See SequentialFileReader constructors + template + explicit LineFileReader(Args&&... args) + : sfr_(std::forward(args)...) {} + + static Status Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg); + + LineFileReader(const LineFileReader&) = delete; + LineFileReader& operator=(const LineFileReader&) = delete; + + // Reads another line from the file, returning true on success and saving + // the line to `out`, without delimiter, or returning false on failure. You + // must check GetStatus() to determine whether the failure was just + // end-of-file (OK status) or an I/O error (another status). + bool ReadLine(std::string* out); + + // Returns the number of the line most recently returned from ReadLine. + // Return value is unspecified if ReadLine has returned false due to + // I/O error. After ReadLine returns false due to end-of-file, return + // value is the last returned line number, or equivalently the total + // number of lines returned. + size_t GetLineNumber() const { return line_number_; } + + // Returns any error encountered during read. The error is considered + // permanent and no retry or recovery is attempted with the same + // LineFileReader. 
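// A minimal sketch (illustration only, not part of the patch): the intended
// ReadLine() loop, where a false return must be disambiguated via
// GetStatus(). The file name and per-line callback are placeholders; assumes
// <functional>, <memory>, and <string> are available.
Status ForEachLine(const std::shared_ptr<FileSystem>& fs,
                   const std::string& fname,
                   const std::function<void(const std::string&)>& cb) {
  std::unique_ptr<LineFileReader> reader;
  Status s = LineFileReader::Create(fs, fname, FileOptions(), &reader,
                                    /*dbg=*/nullptr);
  if (!s.ok()) {
    return s;
  }
  std::string line;
  while (reader->ReadLine(&line)) {
    cb(line);  // reader->GetLineNumber() now reports this line's number
  }
  // false means either end-of-file (OK status) or an I/O error; GetStatus()
  // tells which.
  return reader->GetStatus();
}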
+ const Status& GetStatus() const { return status_; } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index ffe0367a4b2..79b56b944fe 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -25,7 +25,7 @@ class MockRandomAccessFile : public FSRandomAccessFileWrapper { prefetch_count_.fetch_add(1); return target()->Prefetch(offset, n, options, dbg); } else { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("Prefetch not supported"); } } @@ -37,9 +37,9 @@ class MockRandomAccessFile : public FSRandomAccessFileWrapper { class MockFS : public FileSystemWrapper { public: - explicit MockFS(bool support_prefetch) - : FileSystemWrapper(FileSystem::Default()), - support_prefetch_(support_prefetch) {} + explicit MockFS(const std::shared_ptr& wrapped, + bool support_prefetch) + : FileSystemWrapper(wrapped), support_prefetch_(support_prefetch) {} IOStatus NewRandomAccessFile(const std::string& fname, const FileOptions& opts, @@ -57,6 +57,10 @@ class MockFS : public FileSystemWrapper { bool IsPrefetchCalled() { return prefetch_count_ > 0; } + int GetPrefetchCount() { + return prefetch_count_.load(std::memory_order_relaxed); + } + private: const bool support_prefetch_; std::atomic_int prefetch_count_{0}; @@ -69,19 +73,25 @@ class PrefetchTest PrefetchTest() : DBTestBase("/prefetch_test", true) {} }; +INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest, + ::testing::Combine(::testing::Bool(), + ::testing::Bool())); + std::string BuildKey(int num, std::string postfix = "") { return "my_key_" + std::to_string(num) + postfix; } TEST_P(PrefetchTest, Basic) { // First param is if the mockFS support_prefetch or not - bool support_prefetch = std::get<0>(GetParam()); + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); // Second param is if directIO is enabled or not bool use_direct_io = std::get<1>(GetParam()); - const int kNumKeys = 1100; - std::shared_ptr fs = std::make_shared(support_prefetch); + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); Options options = CurrentOptions(); options.write_buffer_size = 1024; @@ -109,21 +119,21 @@ TEST_P(PrefetchTest, Basic) { // create first key range WriteBatch batch; for (int i = 0; i < kNumKeys; i++) { - batch.Put(BuildKey(i), "value for range 1 key"); + ASSERT_OK(batch.Put(BuildKey(i), "value for range 1 key")); } ASSERT_OK(db_->Write(WriteOptions(), &batch)); // create second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - batch.Put(BuildKey(i, "key2"), "value for range 2 key"); + ASSERT_OK(batch.Put(BuildKey(i, "key2"), "value for range 2 key")); } ASSERT_OK(db_->Write(WriteOptions(), &batch)); // delete second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - batch.Delete(BuildKey(i, "key2")); + ASSERT_OK(batch.Delete(BuildKey(i, "key2"))); } ASSERT_OK(db_->Write(WriteOptions(), &batch)); @@ -134,7 +144,7 @@ TEST_P(PrefetchTest, Basic) { Slice greatest(end_key.data(), end_key.size()); // commenting out the line below causes the example to work correctly - db_->CompactRange(CompactRangeOptions(), &least, &greatest); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); if (support_prefetch && !use_direct_io) { // If underline file system supports prefetch, and directIO is not enabled @@ -173,9 +183,491 @@ TEST_P(PrefetchTest, Basic) { Close(); } -INSTANTIATE_TEST_CASE_P(PrefetchTest, 
PrefetchTest, - ::testing::Combine(::testing::Bool(), - ::testing::Bool())); +#ifndef ROCKSDB_LITE +TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + options.disable_auto_compactions = true; + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.max_auto_readahead_size = 0; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + + // DB open will create table readers unless we reduce the table cache + // capacity. SanitizeOptions will set max_open_files to minimum of 20. Table + // cache is allocated with max_open_files - 10 as capacity. So override + // max_open_files to 10 so table cache capacity will become 0. This will + // prevent file open during DB open and force the file to be opened during + // Iteration. + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + Random rnd(309); + int key_count = 0; + const int num_keys_per_level = 100; + // Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299]. + for (int level = 2; level >= 0; level--) { + key_count = level * num_keys_per_level; + for (int i = 0; i < num_keys_per_level; ++i) { + ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(level); + } + Close(); + std::vector buff_prefectch_level_count = {0, 0, 0}; + TryReopen(options); + { + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + + for (int level = 2; level >= 0; level--) { + key_count = level * num_keys_per_level; + switch (level) { + case 0: + // max_auto_readahead_size is set 0 so data and index blocks are not + // prefetched. + ASSERT_OK(db_->SetOptions( + {{"block_based_table_factory", "{max_auto_readahead_size=0;}"}})); + break; + case 1: + // max_auto_readahead_size is set less than + // BlockBasedTable::kInitAutoReadaheadSize. So readahead_size remains + // equal to max_auto_readahead_size. 
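// A minimal sketch (illustration only, not part of the patch): the two ways
// this test exercises the new max_auto_readahead_size knob -- statically via
// BlockBasedTableOptions at open time, and dynamically via SetOptions() on a
// live DB. The 64 KB cap is a placeholder value.
Options MakeOptionsWithCappedReadahead() {
  BlockBasedTableOptions table_options;
  table_options.max_auto_readahead_size = 64 * 1024;  // cap implicit readahead
  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  return options;
}

Status CapAutoReadahead(DB* db) {
  // Dynamic path: mutate the table factory options without reopening.
  return db->SetOptions(
      {{"block_based_table_factory", "{max_auto_readahead_size=65536;}"}});
}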
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{max_auto_readahead_size=4096;}"}})); + break; + case 2: + ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{max_auto_readahead_size=65536;}"}})); + break; + default: + assert(false); + } + + for (int i = 0; i < num_keys_per_level; ++i) { + iter->Seek(Key(key_count++)); + iter->Next(); + } + + buff_prefectch_level_count[level] = buff_prefetch_count; + if (support_prefetch && !use_direct_io) { + if (level == 0) { + ASSERT_FALSE(fs->IsPrefetchCalled()); + } else { + ASSERT_TRUE(fs->IsPrefetchCalled()); + } + fs->ClearPrefetchCount(); + } else { + ASSERT_FALSE(fs->IsPrefetchCalled()); + if (level == 0) { + ASSERT_EQ(buff_prefetch_count, 0); + } else { + ASSERT_GT(buff_prefetch_count, 0); + } + buff_prefetch_count = 0; + } + } + } + + if (!support_prefetch) { + ASSERT_GT(buff_prefectch_level_count[1], buff_prefectch_level_count[2]); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} +#endif // !ROCKSDB_LITE + +TEST_P(PrefetchTest, PrefetchWhenReseek) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + const int kNumKeys = 2000; + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + + { + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + /* + * Reseek keys from sequential Data Blocks within same partitioned + * index. After 2 sequential reads it will prefetch the data block. + * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more + * initially (2 more data blocks). 
+ */ + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // Prefetch Data + iter->Seek(BuildKey(1008)); + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); // Prefetch Data + iter->Seek(BuildKey(1019)); + // Missed 2 blocks but they are already in buffer so no reset. + iter->Seek(BuildKey(103)); // Already in buffer. + iter->Seek(BuildKey(1033)); // Prefetch Data + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 3); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 3); + buff_prefetch_count = 0; + } + } + { + /* + * Reseek keys from non sequential data blocks within same partitioned + * index. buff_prefetch_count will be 0 in that case. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1008)); + iter->Seek(BuildKey(1019)); + iter->Seek(BuildKey(1033)); + iter->Seek(BuildKey(1048)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 0); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + } + { + /* + * Reesek keys from Single Data Block. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1)); + iter->Seek(BuildKey(10)); + iter->Seek(BuildKey(100)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 0); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + } + { + /* + * Reseek keys from sequential data blocks to set implicit auto readahead + * and prefetch data but after that iterate over different (non sequential) + * data blocks which won't prefetch any data further. So buff_prefetch_count + * will be 1 for the first one. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // This iteration will prefetch buffer + iter->Seek(BuildKey(1008)); + iter->Seek( + BuildKey(996)); // Reseek won't prefetch any data and + // readahead_size will be initiallized to 8*1024. + iter->Seek(BuildKey(992)); + iter->Seek(BuildKey(989)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 1); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + buff_prefetch_count = 0; + } + + // Read sequentially to confirm readahead_size is reset to initial value (2 + // more data blocks) + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); // Prefetch Data + iter->Seek(BuildKey(1022)); + iter->Seek(BuildKey(1026)); + iter->Seek(BuildKey(103)); // Prefetch Data + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 2); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 2); + buff_prefetch_count = 0; + } + } + { + /* Reseek keys from sequential partitioned index block. Since partitioned + * index fetch are sequential, buff_prefetch_count will be 1. 
+ */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1167)); + iter->Seek(BuildKey(1334)); // This iteration will prefetch buffer + iter->Seek(BuildKey(1499)); + iter->Seek(BuildKey(1667)); + iter->Seek(BuildKey(1847)); + iter->Seek(BuildKey(1999)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 1); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + buff_prefetch_count = 0; + } + } + { + /* + * Reseek over different keys from different blocks. buff_prefetch_count is + * set 0. + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + int i = 0; + int j = 1000; + do { + iter->Seek(BuildKey(i)); + if (!iter->Valid()) { + break; + } + i = i + 100; + iter->Seek(BuildKey(j)); + j = j + 100; + } while (i < 1000 && j < kNumKeys && iter->Valid()); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 0); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 0); + buff_prefetch_count = 0; + } + } + { + /* Iterates sequentially over all keys. It will prefetch the buffer.*/ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + } + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 13); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 13); + buff_prefetch_count = 0; + } + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} + +TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + const int kNumKeys = 2000; + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + + BlockBasedTableOptions table_options; + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); // 8MB + table_options.block_cache = cache; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + + int buff_prefetch_count = 0; + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice 
greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + + { + /* + * Reseek keys from sequential Data Blocks within same partitioned + * index. After 2 sequential reads it will prefetch the data block. + * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more + * initially (2 more data blocks). + */ + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + // Warm up the cache + iter->Seek(BuildKey(1011)); + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 1); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + buff_prefetch_count = 0; + } + } + { + // After caching, blocks will be read from cache (Sequential blocks) + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + iter->Seek(BuildKey(0)); + iter->Seek(BuildKey(1000)); + iter->Seek(BuildKey(1004)); // Prefetch data (not in cache). + // Missed one sequential block but next is in already in buffer so readahead + // will not be reset. + iter->Seek(BuildKey(1011)); + // Prefetch data but blocks are in cache so no prefetch and reset. + iter->Seek(BuildKey(1015)); + iter->Seek(BuildKey(1019)); + iter->Seek(BuildKey(1022)); + // Prefetch data with readahead_size = 4 blocks. + iter->Seek(BuildKey(1026)); + iter->Seek(BuildKey(103)); + iter->Seek(BuildKey(1033)); + iter->Seek(BuildKey(1037)); + + if (support_prefetch && !use_direct_io) { + ASSERT_EQ(fs->GetPrefetchCount(), 3); + fs->ClearPrefetchCount(); + } else { + ASSERT_EQ(buff_prefetch_count, 2); + buff_prefetch_count = 0; + } + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} } // namespace ROCKSDB_NAMESPACE diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc index 4d3c1a7f4bf..e15b6b0338d 100644 --- a/file/random_access_file_reader.cc +++ b/file/random_access_file_reader.cc @@ -12,6 +12,7 @@ #include #include +#include "file/file_util.h" #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" @@ -21,18 +22,29 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { +IOStatus RandomAccessFileReader::Create( + const std::shared_ptr& fs, const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* reader, IODebugContext* dbg) { + std::unique_ptr file; + IOStatus io_s = fs->NewRandomAccessFile(fname, file_opts, &file, dbg); + if (io_s.ok()) { + reader->reset(new RandomAccessFileReader(std::move(file), fname)); + } + return io_s; +} -Status RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, - size_t n, Slice* result, char* scratch, - AlignedBuf* aligned_buf, - bool for_compaction) const { +IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, + size_t n, Slice* result, char* scratch, + AlignedBuf* aligned_buf, + bool for_compaction) const { (void)aligned_buf; TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr); - Status s; + IOStatus io_s; uint64_t elapsed = 0; { - StopWatch sw(env_, stats_, hist_type_, + StopWatch sw(clock_, stats_, hist_type_, (stats_ != nullptr) ? 
&elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); @@ -68,28 +80,28 @@ Status RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, } { - IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); // Only user reads are expected to specify a timeout. And user reads // are not subjected to rate_limiter and should go through only // one iteration of this loop, so we don't need to check and adjust // the opts.timeout before calling file_->Read assert(!opts.timeout.count() || allowed == read_size); - s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts, - &tmp, buf.Destination(), nullptr); + io_s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts, + &tmp, buf.Destination(), nullptr); } if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts, - s); + io_s); } buf.Size(buf.CurrentSize() + tmp.size()); - if (!s.ok() || tmp.size() < allowed) { + if (!io_s.ok() || tmp.size() < allowed) { break; } } size_t res_len = 0; - if (s.ok() && offset_advance < buf.CurrentSize()) { + if (io_s.ok() && offset_advance < buf.CurrentSize()) { res_len = std::min(buf.CurrentSize() - offset_advance, n); if (aligned_buf == nullptr) { buf.Read(scratch, offset_advance, res_len); @@ -128,20 +140,20 @@ Status RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, #endif { - IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); // Only user reads are expected to specify a timeout. And user reads // are not subjected to rate_limiter and should go through only // one iteration of this loop, so we don't need to check and adjust // the opts.timeout before calling file_->Read assert(!opts.timeout.count() || allowed == n); - s = file_->Read(offset + pos, allowed, opts, &tmp_result, - scratch + pos, nullptr); + io_s = file_->Read(offset + pos, allowed, opts, &tmp_result, + scratch + pos, nullptr); } #ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, - finish_ts, s); + finish_ts, io_s); } #endif @@ -154,11 +166,11 @@ Status RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, assert(tmp_result.data() == res_scratch + pos); } pos += tmp_result.size(); - if (!s.ok() || tmp_result.size() < allowed) { + if (!io_s.ok() || tmp_result.size() < allowed) { break; } } - *result = Slice(res_scratch, s.ok() ? pos : 0); + *result = Slice(res_scratch, io_s.ok() ? 
pos : 0); } IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size()); SetPerfLevel(prev_perf_level); @@ -167,7 +179,7 @@ Status RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, file_read_hist_->Add(elapsed); } - return s; + return io_s; } size_t End(const FSReadRequest& r) { @@ -196,16 +208,16 @@ bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) { return true; } -Status RandomAccessFileReader::MultiRead(const IOOptions& opts, - FSReadRequest* read_reqs, - size_t num_reqs, - AlignedBuf* aligned_buf) const { +IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts, + FSReadRequest* read_reqs, + size_t num_reqs, + AlignedBuf* aligned_buf) const { (void)aligned_buf; // suppress warning of unused variable in LITE mode assert(num_reqs > 0); - Status s; + IOStatus io_s; uint64_t elapsed = 0; { - StopWatch sw(env_, stats_, hist_type_, + StopWatch sw(clock_, stats_, hist_type_, (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); @@ -221,11 +233,19 @@ Status RandomAccessFileReader::MultiRead(const IOOptions& opts, aligned_reqs.reserve(num_reqs); // Align and merge the read requests. size_t alignment = file_->GetRequiredBufferAlignment(); - aligned_reqs.push_back(Align(read_reqs[0], alignment)); - for (size_t i = 1; i < num_reqs; i++) { + for (size_t i = 0; i < num_reqs; i++) { const auto& r = Align(read_reqs[i], alignment); - if (!TryMerge(&aligned_reqs.back(), r)) { + if (i == 0) { + // head + aligned_reqs.push_back(r); + + } else if (!TryMerge(&aligned_reqs.back(), r)) { + // head + n aligned_reqs.push_back(r); + + } else { + // unused + r.status.PermitUncheckedError(); } } TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::MultiRead:AlignedReqs", @@ -259,8 +279,8 @@ Status RandomAccessFileReader::MultiRead(const IOOptions& opts, #endif // ROCKSDB_LITE { - IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); - s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr); + IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); + io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr); } #ifndef ROCKSDB_LITE @@ -301,7 +321,15 @@ Status RandomAccessFileReader::MultiRead(const IOOptions& opts, file_read_hist_->Add(elapsed); } - return s; + return io_s; } +IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro, + IOOptions& opts) { + if (clock_ != nullptr) { + return PrepareIOFromReadOptions(ro, clock_, opts); + } else { + return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts); + } +} } // namespace ROCKSDB_NAMESPACE diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h index a0f7a191736..181f4dd0295 100644 --- a/file/random_access_file_reader.h +++ b/file/random_access_file_reader.h @@ -14,7 +14,6 @@ #include "env/file_system_tracer.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/file_system.h" #include "rocksdb/listener.h" #include "rocksdb/options.h" @@ -24,6 +23,7 @@ namespace ROCKSDB_NAMESPACE { class Statistics; class HistogramImpl; +class SystemClock; using AlignedBuf = std::unique_ptr; @@ -38,7 +38,7 @@ FSReadRequest Align(const FSReadRequest& r, size_t alignment); // Otherwise, do nothing and return false. bool TryMerge(FSReadRequest* dest, const FSReadRequest& src); -// RandomAccessFileReader is a wrapper on top of Env::RnadomAccessFile. It is +// RandomAccessFileReader is a wrapper on top of Env::RandomAccessFile. It is // responsible for: // - Handling Buffered and Direct reads appropriately. 
// - Rate limiting compaction reads. @@ -67,7 +67,7 @@ class RandomAccessFileReader { FSRandomAccessFilePtr file_; std::string file_name_; - Env* env_; + SystemClock* clock_; Statistics* stats_; uint32_t hist_type_; HistogramImpl* file_read_hist_; @@ -77,14 +77,15 @@ class RandomAccessFileReader { public: explicit RandomAccessFileReader( std::unique_ptr&& raf, const std::string& _file_name, - Env* _env = nullptr, const std::shared_ptr& io_tracer = nullptr, + SystemClock* clock = nullptr, + const std::shared_ptr& io_tracer = nullptr, Statistics* stats = nullptr, uint32_t hist_type = 0, HistogramImpl* file_read_hist = nullptr, RateLimiter* rate_limiter = nullptr, const std::vector>& listeners = {}) - : file_(std::move(raf), io_tracer), + : file_(std::move(raf), io_tracer, _file_name), file_name_(std::move(_file_name)), - env_(_env), + clock_(clock), stats_(stats), hist_type_(hist_type), file_read_hist_(file_read_hist), @@ -102,6 +103,10 @@ class RandomAccessFileReader { #endif } + static IOStatus Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg); RandomAccessFileReader(const RandomAccessFileReader&) = delete; RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete; @@ -115,19 +120,19 @@ class RandomAccessFileReader { // 2. Otherwise, scratch is not used and can be null, the aligned_buf owns // the internally allocated buffer on return, and the result refers to a // region in aligned_buf. - Status Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result, - char* scratch, AlignedBuf* aligned_buf, - bool for_compaction = false) const; + IOStatus Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result, + char* scratch, AlignedBuf* aligned_buf, + bool for_compaction = false) const; // REQUIRES: // num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing. // In non-direct IO mode, aligned_buf should be null; // In direct IO mode, aligned_buf stores the aligned buffer allocated inside // MultiRead, the result Slices in reqs refer to aligned_buf. 
- Status MultiRead(const IOOptions& opts, FSReadRequest* reqs, size_t num_reqs, - AlignedBuf* aligned_buf) const; + IOStatus MultiRead(const IOOptions& opts, FSReadRequest* reqs, + size_t num_reqs, AlignedBuf* aligned_buf) const; - Status Prefetch(uint64_t offset, size_t n) const { + IOStatus Prefetch(uint64_t offset, size_t n) const { return file_->Prefetch(offset, n, IOOptions(), nullptr); } @@ -137,6 +142,6 @@ class RandomAccessFileReader { bool use_direct_io() const { return file_->use_direct_io(); } - Env* env() const { return env_; } + IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts); }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/random_access_file_reader_test.cc b/file/random_access_file_reader_test.cc index 1358fd4b0f8..77a0e84a0be 100644 --- a/file/random_access_file_reader_test.cc +++ b/file/random_access_file_reader_test.cc @@ -38,11 +38,12 @@ class RandomAccessFileReaderTest : public testing::Test { } void Read(const std::string& fname, const FileOptions& opts, - std::unique_ptr* reader) { + std::unique_ptr* reader) { std::string fpath = Path(fname); std::unique_ptr f; ASSERT_OK(fs_->NewRandomAccessFile(fpath, opts, &f, nullptr)); - (*reader).reset(new RandomAccessFileReader(std::move(f), fpath, env_)); + reader->reset(new RandomAccessFileReader(std::move(f), fpath, + env_->GetSystemClock().get())); } void AssertResult(const std::string& content, @@ -145,6 +146,7 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { // Reads the first page internally. ASSERT_EQ(aligned_reqs.size(), 1); const FSReadRequest& aligned_r = aligned_reqs[0]; + ASSERT_OK(aligned_r.status); ASSERT_EQ(aligned_r.offset, 0); ASSERT_EQ(aligned_r.len, page_size); } @@ -189,6 +191,7 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { // Reads the first two pages in one request internally. ASSERT_EQ(aligned_reqs.size(), 1); const FSReadRequest& aligned_r = aligned_reqs[0]; + ASSERT_OK(aligned_r.status); ASSERT_EQ(aligned_r.offset, 0); ASSERT_EQ(aligned_r.len, 2 * page_size); } @@ -233,6 +236,7 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { // Reads the first 3 pages in one request internally. 
ASSERT_EQ(aligned_reqs.size(), 1); const FSReadRequest& aligned_r = aligned_reqs[0]; + ASSERT_OK(aligned_r.status); ASSERT_EQ(aligned_r.offset, 0); ASSERT_EQ(aligned_r.len, 3 * page_size); } @@ -270,8 +274,10 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { ASSERT_EQ(aligned_reqs.size(), 2); const FSReadRequest& aligned_r0 = aligned_reqs[0]; const FSReadRequest& aligned_r1 = aligned_reqs[1]; + ASSERT_OK(aligned_r0.status); ASSERT_EQ(aligned_r0.offset, 0); ASSERT_EQ(aligned_r0.len, page_size); + ASSERT_OK(aligned_r1.status); ASSERT_EQ(aligned_r1.offset, 2 * page_size); ASSERT_EQ(aligned_r1.len, page_size); } @@ -287,8 +293,11 @@ TEST(FSReadRequest, Align) { r.offset = 2000; r.len = 2000; r.scratch = nullptr; + ASSERT_OK(r.status); FSReadRequest aligned_r = Align(r, 1024); + ASSERT_OK(r.status); + ASSERT_OK(aligned_r.status); ASSERT_EQ(aligned_r.offset, 1024); ASSERT_EQ(aligned_r.len, 3072); } @@ -303,14 +312,20 @@ TEST(FSReadRequest, TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 15; src.len = 10; src.scratch = nullptr; + ASSERT_OK(src.status); - if (reverse) std::swap(dest, src); + if (reverse) { + std::swap(dest, src); + } ASSERT_FALSE(TryMerge(&dest, src)); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } { @@ -320,16 +335,22 @@ TEST(FSReadRequest, TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 10; src.len = 10; src.scratch = nullptr; + ASSERT_OK(src.status); - if (reverse) std::swap(dest, src); + if (reverse) { + std::swap(dest, src); + } ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 20); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } { @@ -339,16 +360,22 @@ TEST(FSReadRequest, TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 5; src.len = 10; src.scratch = nullptr; + ASSERT_OK(src.status); - if (reverse) std::swap(dest, src); + if (reverse) { + std::swap(dest, src); + } ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 15); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } { @@ -358,16 +385,22 @@ TEST(FSReadRequest, TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 5; src.len = 5; src.scratch = nullptr; + ASSERT_OK(src.status); - if (reverse) std::swap(dest, src); + if (reverse) { + std::swap(dest, src); + } ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } { @@ -377,16 +410,20 @@ TEST(FSReadRequest, TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 5; src.len = 1; src.scratch = nullptr; + ASSERT_OK(src.status); if (reverse) std::swap(dest, src); ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } { @@ -396,16 +433,20 @@ TEST(FSReadRequest, TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 0; src.len = 10; src.scratch = nullptr; + ASSERT_OK(src.status); if (reverse) std::swap(dest, src); ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } { @@ -415,16 +456,20 @@ TEST(FSReadRequest, 
TryMerge) { dest.offset = 0; dest.len = 10; dest.scratch = nullptr; + ASSERT_OK(dest.status); FSReadRequest src; src.offset = 0; src.len = 5; src.scratch = nullptr; + ASSERT_OK(src.status); if (reverse) std::swap(dest, src); ASSERT_TRUE(TryMerge(&dest, src)); ASSERT_EQ(dest.offset, 0); ASSERT_EQ(dest.len, 10); + ASSERT_OK(dest.status); + ASSERT_OK(src.status); } } } diff --git a/file/read_write_util.cc b/file/read_write_util.cc index b4854e110fa..9df6c5a39d8 100644 --- a/file/read_write_util.cc +++ b/file/read_write_util.cc @@ -18,47 +18,10 @@ IOStatus NewWritableFile(FileSystem* fs, const std::string& fname, std::unique_ptr* result, const FileOptions& options) { IOStatus s = fs->NewWritableFile(fname, options, result, nullptr); - TEST_KILL_RANDOM("NewWritableFile:0", rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("NewWritableFile:0", REDUCE_ODDS2); return s; } -bool ReadOneLine(std::istringstream* iss, SequentialFileReader* seq_file_reader, - std::string* output, bool* has_data, Status* result) { - const int kBufferSize = 8192; - char buffer[kBufferSize + 1]; - Slice input_slice; - - std::string line; - bool has_complete_line = false; - while (!has_complete_line) { - if (std::getline(*iss, line)) { - has_complete_line = !iss->eof(); - } else { - has_complete_line = false; - } - if (!has_complete_line) { - // if we're not sure whether we have a complete line, - // further read from the file. - if (*has_data) { - *result = seq_file_reader->Read(kBufferSize, &input_slice, buffer); - } - if (input_slice.size() == 0) { - // meaning we have read all the data - *has_data = false; - break; - } else { - iss->str(line + input_slice.ToString()); - // reset the internal state of iss so that we can keep reading it. - iss->clear(); - *has_data = (input_slice.size() == kBufferSize); - continue; - } - } - } - *output = line; - return *has_data || has_complete_line; -} - #ifndef NDEBUG bool IsFileSectorAligned(const size_t off, size_t sector_size) { return off % sector_size == 0; diff --git a/file/read_write_util.h b/file/read_write_util.h index 22f4076b344..718135c9885 100644 --- a/file/read_write_util.h +++ b/file/read_write_util.h @@ -24,10 +24,6 @@ extern IOStatus NewWritableFile(FileSystem* fs, const std::string& fname, std::unique_ptr* result, const FileOptions& options); -// Read a single line from a file. 
-bool ReadOneLine(std::istringstream* iss, SequentialFileReader* seq_file_reader, - std::string* output, bool* has_data, Status* result); - #ifndef NDEBUG bool IsFileSectorAligned(const size_t off, size_t sector_size); #endif // NDEBUG diff --git a/file/readahead_raf.cc b/file/readahead_raf.cc index 493f9d9e895..6d346432e22 100644 --- a/file/readahead_raf.cc +++ b/file/readahead_raf.cc @@ -11,15 +11,17 @@ #include #include + #include "file/read_write_util.h" +#include "rocksdb/file_system.h" #include "util/aligned_buffer.h" #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { namespace { -class ReadaheadRandomAccessFile : public RandomAccessFile { +class ReadaheadRandomAccessFile : public FSRandomAccessFile { public: - ReadaheadRandomAccessFile(std::unique_ptr&& file, + ReadaheadRandomAccessFile(std::unique_ptr&& file, size_t readahead_size) : file_(std::move(file)), alignment_(file_->GetRequiredBufferAlignment()), @@ -35,11 +37,12 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) = delete; - Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { // Read-ahead only make sense if we have some slack left after reading if (n + alignment_ >= readahead_size_) { - return file_->Read(offset, n, result, scratch); + return file_->Read(offset, n, options, result, scratch, dbg); } std::unique_lock lk(lock_); @@ -53,14 +56,14 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { (cached_len == n || buffer_.CurrentSize() < readahead_size_)) { // We read exactly what we needed, or we hit end of file - return. *result = Slice(scratch, cached_len); - return Status::OK(); + return IOStatus::OK(); } size_t advanced_offset = static_cast(offset + cached_len); // In the case of cache hit advanced_offset is already aligned, means that // chunk_offset equals to advanced_offset size_t chunk_offset = TruncateToPageBoundary(alignment_, advanced_offset); - Status s = ReadIntoBuffer(chunk_offset, readahead_size_); + IOStatus s = ReadIntoBuffer(chunk_offset, readahead_size_, options, dbg); if (s.ok()) { // The data we need is now in cache, so we can safely read it size_t remaining_len; @@ -71,11 +74,12 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { return s; } - Status Prefetch(uint64_t offset, size_t n) override { + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override { if (n < readahead_size_) { // Don't allow smaller prefetches than the configured `readahead_size_`. // `Read()` assumes a smaller prefetch buffer indicates EOF was reached. 
- return Status::OK(); + return IOStatus::OK(); } std::unique_lock lk(lock_); @@ -83,10 +87,11 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { size_t offset_ = static_cast(offset); size_t prefetch_offset = TruncateToPageBoundary(alignment_, offset_); if (prefetch_offset == buffer_offset_) { - return Status::OK(); + return IOStatus::OK(); } return ReadIntoBuffer(prefetch_offset, - Roundup(offset_ + n, alignment_) - prefetch_offset); + Roundup(offset_ + n, alignment_) - prefetch_offset, + options, dbg); } size_t GetUniqueId(char* id, size_t max_size) const override { @@ -95,7 +100,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { void Hint(AccessPattern pattern) override { file_->Hint(pattern); } - Status InvalidateCache(size_t offset, size_t length) override { + IOStatus InvalidateCache(size_t offset, size_t length) override { std::unique_lock lk(lock_); buffer_.Clear(); return file_->InvalidateCache(offset, length); @@ -125,14 +130,16 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { // Reads into buffer_ the next n bytes from file_ starting at offset. // Can actually read less if EOF was reached. // Returns the status of the read operastion on the file. - Status ReadIntoBuffer(uint64_t offset, size_t n) const { + IOStatus ReadIntoBuffer(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) const { if (n > buffer_.Capacity()) { n = buffer_.Capacity(); } assert(IsFileSectorAligned(offset, alignment_)); assert(IsFileSectorAligned(n, alignment_)); Slice result; - Status s = file_->Read(offset, n, &result, buffer_.BufferStart()); + IOStatus s = + file_->Read(offset, n, options, &result, buffer_.BufferStart(), dbg); if (s.ok()) { buffer_offset_ = offset; buffer_.Size(result.size()); @@ -141,7 +148,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { return s; } - const std::unique_ptr file_; + const std::unique_ptr file_; const size_t alignment_; const size_t readahead_size_; @@ -153,9 +160,9 @@ class ReadaheadRandomAccessFile : public RandomAccessFile { }; } // namespace -std::unique_ptr NewReadaheadRandomAccessFile( - std::unique_ptr&& file, size_t readahead_size) { - std::unique_ptr result( +std::unique_ptr NewReadaheadRandomAccessFile( + std::unique_ptr&& file, size_t readahead_size) { + std::unique_ptr result( new ReadaheadRandomAccessFile(std::move(file), readahead_size)); return result; } diff --git a/file/readahead_raf.h b/file/readahead_raf.h index cbdcb124fd0..dfaf2b4fa94 100644 --- a/file/readahead_raf.h +++ b/file/readahead_raf.h @@ -8,10 +8,12 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include -#include "rocksdb/env.h" +#include + +#include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { +class FSRandomAccessFile; // This file provides the following main abstractions: // SequentialFileReader : wrapper over Env::SequentialFile // RandomAccessFileReader : wrapper over Env::RandomAccessFile @@ -22,6 +24,6 @@ namespace ROCKSDB_NAMESPACE { // NewReadaheadRandomAccessFile provides a wrapper over RandomAccessFile to // always prefetch additional data with every read. This is mainly used in // Compaction Table Readers. 
-std::unique_ptr NewReadaheadRandomAccessFile( - std::unique_ptr&& file, size_t readahead_size); +std::unique_ptr NewReadaheadRandomAccessFile( + std::unique_ptr&& file, size_t readahead_size); } // namespace ROCKSDB_NAMESPACE diff --git a/file/sequence_file_reader.cc b/file/sequence_file_reader.cc index 81c5e5d1d46..3a87b6d102f 100644 --- a/file/sequence_file_reader.cc +++ b/file/sequence_file_reader.cc @@ -22,6 +22,18 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { +Status SequentialFileReader::Create( + const std::shared_ptr& fs, const std::string& fname, + const FileOptions& file_opts, std::unique_ptr* reader, + IODebugContext* dbg) { + std::unique_ptr file; + Status s = fs->NewSequentialFile(fname, file_opts, &file, dbg); + if (s.ok()) { + reader->reset(new SequentialFileReader(std::move(file), fname)); + } + return s; +} + Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) { Status s; if (use_direct_io()) { diff --git a/file/sequence_file_reader.h b/file/sequence_file_reader.h index 139e5255fbc..ea315f853e2 100644 --- a/file/sequence_file_reader.h +++ b/file/sequence_file_reader.h @@ -31,7 +31,8 @@ class SequentialFileReader { explicit SequentialFileReader( std::unique_ptr&& _file, const std::string& _file_name, const std::shared_ptr& io_tracer = nullptr) - : file_name_(_file_name), file_(std::move(_file), io_tracer) {} + : file_name_(_file_name), + file_(std::move(_file), io_tracer, _file_name) {} explicit SequentialFileReader( std::unique_ptr&& _file, const std::string& _file_name, @@ -39,7 +40,11 @@ class SequentialFileReader { const std::shared_ptr& io_tracer = nullptr) : file_name_(_file_name), file_(NewReadaheadSequentialFile(std::move(_file), _readahead_size), - io_tracer) {} + io_tracer, _file_name) {} + static Status Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* reader, + IODebugContext* dbg); SequentialFileReader(const SequentialFileReader&) = delete; SequentialFileReader& operator=(const SequentialFileReader&) = delete; diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index baf58d6b8e0..cc03e54441d 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -9,7 +9,6 @@ #include #include "db/db_impl/db_impl.h" -#include "env/composite_env_wrapper.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/sst_file_manager.h" @@ -19,21 +18,21 @@ namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE -SstFileManagerImpl::SstFileManagerImpl(Env* env, std::shared_ptr fs, - std::shared_ptr logger, - int64_t rate_bytes_per_sec, - double max_trash_db_ratio, - uint64_t bytes_max_delete_chunk) - : env_(env), +SstFileManagerImpl::SstFileManagerImpl( + const std::shared_ptr& clock, + const std::shared_ptr& fs, + const std::shared_ptr& logger, int64_t rate_bytes_per_sec, + double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) + : clock_(clock), fs_(fs), logger_(logger), total_files_size_(0), - in_progress_files_size_(0), compaction_buffer_size_(0), cur_compactions_reserved_size_(0), max_allowed_space_(0), - delete_scheduler_(env, fs_.get(), rate_bytes_per_sec, logger.get(), this, - max_trash_db_ratio, bytes_max_delete_chunk), + delete_scheduler_(clock_.get(), fs_.get(), rate_bytes_per_sec, + logger.get(), this, max_trash_db_ratio, + bytes_max_delete_chunk), cv_(&mu_), closing_(false), bg_thread_(nullptr), @@ -60,23 +59,24 @@ void SstFileManagerImpl::Close() { } } -Status SstFileManagerImpl::OnAddFile(const std::string& 
file_path, - bool compaction) { +Status SstFileManagerImpl::OnAddFile(const std::string& file_path) { uint64_t file_size; Status s = fs_->GetFileSize(file_path, IOOptions(), &file_size, nullptr); if (s.ok()) { MutexLock l(&mu_); - OnAddFileImpl(file_path, file_size, compaction); + OnAddFileImpl(file_path, file_size); } - TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile"); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile", + const_cast(&file_path)); return s; } Status SstFileManagerImpl::OnAddFile(const std::string& file_path, - uint64_t file_size, bool compaction) { + uint64_t file_size) { MutexLock l(&mu_); - OnAddFileImpl(file_path, file_size, compaction); - TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile"); + OnAddFileImpl(file_path, file_size); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnAddFile", + const_cast(&file_path)); return Status::OK(); } @@ -85,7 +85,8 @@ Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) { MutexLock l(&mu_); OnDeleteFileImpl(file_path); } - TEST_SYNC_POINT("SstFileManagerImpl::OnDeleteFile"); + TEST_SYNC_POINT_CALLBACK("SstFileManagerImpl::OnDeleteFile", + const_cast(&file_path)); return Status::OK(); } @@ -99,19 +100,6 @@ void SstFileManagerImpl::OnCompactionCompletion(Compaction* c) { } } cur_compactions_reserved_size_ -= size_added_by_compaction; - - auto new_files = c->edit()->GetNewFiles(); - for (auto& new_file : new_files) { - auto fn = TableFileName(c->immutable_cf_options()->cf_paths, - new_file.second.fd.GetNumber(), - new_file.second.fd.GetPathId()); - if (in_progress_files_.find(fn) != in_progress_files_.end()) { - auto tracked_file = tracked_files_.find(fn); - assert(tracked_file != tracked_files_.end()); - in_progress_files_size_ -= tracked_file->second; - in_progress_files_.erase(fn); - } - } } Status SstFileManagerImpl::OnMoveFile(const std::string& old_path, @@ -122,7 +110,7 @@ Status SstFileManagerImpl::OnMoveFile(const std::string& old_path, if (file_size != nullptr) { *file_size = tracked_files_[old_path]; } - OnAddFileImpl(new_path, tracked_files_[old_path], false); + OnAddFileImpl(new_path, tracked_files_[old_path]); OnDeleteFileImpl(old_path); } TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile"); @@ -159,7 +147,7 @@ bool SstFileManagerImpl::IsMaxAllowedSpaceReachedIncludingCompactions() { bool SstFileManagerImpl::EnoughRoomForCompaction( ColumnFamilyData* cfd, const std::vector& inputs, - Status bg_error) { + const Status& bg_error) { MutexLock l(&mu_); uint64_t size_added_by_compaction = 0; // First check if we even have the space to do the compaction @@ -184,7 +172,7 @@ bool SstFileManagerImpl::EnoughRoomForCompaction( // seen a NoSpace() error. 
This is tin order to contain a single potentially // misbehaving DB instance and prevent it from slowing down compactions of // other DB instances - if (bg_error == Status::NoSpace() && CheckFreeSpace()) { + if (bg_error.IsNoSpace() && CheckFreeSpace()) { auto fn = TableFileName(cfd->ioptions()->cf_paths, inputs[0][0]->fd.GetNumber(), inputs[0][0]->fd.GetPathId()); @@ -199,7 +187,6 @@ bool SstFileManagerImpl::EnoughRoomForCompaction( if (compaction_buffer_size_ == 0) { needed_headroom += reserved_disk_buffer_; } - needed_headroom -= in_progress_files_size_; if (free_space < needed_headroom + size_added_by_compaction) { // We hit the condition of not enough disk space ROCKS_LOG_ERROR(logger_, @@ -348,7 +335,7 @@ void SstFileManagerImpl::ClearError() { if (!error_handler_list_.empty()) { // If there are more instances to be recovered, reschedule after 5 // seconds - int64_t wait_until = env_->NowMicros() + 5000000; + int64_t wait_until = clock_->NowMicros() + 5000000; cv_.TimedWait(wait_until); } @@ -440,24 +427,15 @@ void SstFileManagerImpl::WaitForEmptyTrash() { } void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path, - uint64_t file_size, bool compaction) { + uint64_t file_size) { auto tracked_file = tracked_files_.find(file_path); if (tracked_file != tracked_files_.end()) { // File was added before, we will just update the size - assert(!compaction); total_files_size_ -= tracked_file->second; total_files_size_ += file_size; cur_compactions_reserved_size_ -= file_size; } else { total_files_size_ += file_size; - if (compaction) { - // Keep track of the size of files created by in-progress compactions. - // When calculating whether there's enough headroom for new compactions, - // this will be subtracted from cur_compactions_reserved_size_. - // Otherwise, compactions will be double counted. 
- in_progress_files_size_ += file_size; - in_progress_files_.insert(file_path); - } } tracked_files_[file_path] = file_size; } @@ -466,16 +444,10 @@ void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) { auto tracked_file = tracked_files_.find(file_path); if (tracked_file == tracked_files_.end()) { // File is not tracked - assert(in_progress_files_.find(file_path) == in_progress_files_.end()); return; } total_files_size_ -= tracked_file->second; - // Check if it belonged to an in-progress compaction - if (in_progress_files_.find(file_path) != in_progress_files_.end()) { - in_progress_files_size_ -= tracked_file->second; - in_progress_files_.erase(file_path); - } tracked_files_.erase(tracked_file); } @@ -485,14 +457,7 @@ SstFileManager* NewSstFileManager(Env* env, std::shared_ptr info_log, bool delete_existing_trash, Status* status, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) { - std::shared_ptr fs; - - if (env == Env::Default()) { - fs = FileSystem::Default(); - } else { - fs.reset(new LegacyFileSystemWrapper(env)); - } - + const auto& fs = env->GetFileSystem(); return NewSstFileManager(env, fs, info_log, trash_dir, rate_bytes_per_sec, delete_existing_trash, status, max_trash_db_ratio, bytes_max_delete_chunk); @@ -505,8 +470,9 @@ SstFileManager* NewSstFileManager(Env* env, std::shared_ptr fs, bool delete_existing_trash, Status* status, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk) { + const auto& clock = env->GetSystemClock(); SstFileManagerImpl* res = - new SstFileManagerImpl(env, fs, info_log, rate_bytes_per_sec, + new SstFileManagerImpl(clock, fs, info_log, rate_bytes_per_sec, max_trash_db_ratio, bytes_max_delete_chunk); // trash_dir is deprecated and not needed anymore, but if user passed it @@ -517,10 +483,6 @@ SstFileManager* NewSstFileManager(Env* env, std::shared_ptr fs, s = fs->GetChildren(trash_dir, IOOptions(), &files_in_trash, nullptr); if (s.ok()) { for (const std::string& trash_file : files_in_trash) { - if (trash_file == "." || trash_file == "..") { - continue; - } - std::string path_in_trash = trash_dir + "/" + trash_file; res->OnAddFile(path_in_trash); Status file_delete = diff --git a/file/sst_file_manager_impl.h b/file/sst_file_manager_impl.h index 9f4d99049f0..796a8df82fb 100644 --- a/file/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -12,48 +12,45 @@ #include "port/port.h" #include "db/compaction/compaction.h" -#include "db/error_handler.h" #include "file/delete_scheduler.h" -#include "rocksdb/file_system.h" #include "rocksdb/sst_file_manager.h" namespace ROCKSDB_NAMESPACE { - -class Env; +class ErrorHandler; +class FileSystem; +class SystemClock; class Logger; -// SstFileManager is used to track SST files in the DB and control there -// deletion rate. -// All SstFileManager public functions are thread-safe. +// SstFileManager is used to track SST and blob files in the DB and control +// their deletion rate. All SstFileManager public functions are thread-safe. class SstFileManagerImpl : public SstFileManager { public: - explicit SstFileManagerImpl(Env* env, std::shared_ptr fs, - std::shared_ptr logger, + explicit SstFileManagerImpl(const std::shared_ptr& clock, + const std::shared_ptr& fs, + const std::shared_ptr& logger, int64_t rate_bytes_per_sec, double max_trash_db_ratio, uint64_t bytes_max_delete_chunk); ~SstFileManagerImpl(); - // DB will call OnAddFile whenever a new sst file is added. 
- virtual Status OnAddFile(const std::string& file_path, - bool compaction = false); + // DB will call OnAddFile whenever a new sst/blob file is added. + virtual Status OnAddFile(const std::string& file_path); // Overload where size of the file is provided by the caller rather than // queried from the filesystem. This is an optimization. - Status OnAddFile(const std::string& file_path, uint64_t file_size, - bool compaction); + Status OnAddFile(const std::string& file_path, uint64_t file_size); - // DB will call OnDeleteFile whenever an sst file is deleted. + // DB will call OnDeleteFile whenever a sst/blob file is deleted. Status OnDeleteFile(const std::string& file_path); - // DB will call OnMoveFile whenever an sst file is move to a new path. + // DB will call OnMoveFile whenever a sst/blob file is move to a new path. Status OnMoveFile(const std::string& old_path, const std::string& new_path, uint64_t* file_size = nullptr); // Update the maximum allowed space that should be used by RocksDB, if - // the total size of the SST files exceeds max_allowed_space, writes to - // RocksDB will fail. + // the total size of the SST and blob files exceeds max_allowed_space, writes + // to RocksDB will fail. // // Setting max_allowed_space to 0 will disable this feature, maximum allowed // space will be infinite (Default value). @@ -63,8 +60,8 @@ class SstFileManagerImpl : public SstFileManager { void SetCompactionBufferSize(uint64_t compaction_buffer_size) override; - // Return true if the total size of SST files exceeded the maximum allowed - // space usage. + // Return true if the total size of SST and blob files exceeded the maximum + // allowed space usage. // // thread-safe. bool IsMaxAllowedSpaceReached() override; @@ -78,7 +75,7 @@ class SstFileManagerImpl : public SstFileManager { // the full compaction size). bool EnoughRoomForCompaction(ColumnFamilyData* cfd, const std::vector& inputs, - Status bg_error); + const Status& bg_error); // Bookkeeping so total_file_sizes_ goes back to normal after compaction // finishes @@ -143,8 +140,7 @@ class SstFileManagerImpl : public SstFileManager { private: // REQUIRES: mutex locked - void OnAddFileImpl(const std::string& file_path, uint64_t file_size, - bool compaction); + void OnAddFileImpl(const std::string& file_path, uint64_t file_size); // REQUIRES: mutex locked void OnDeleteFileImpl(const std::string& file_path); @@ -153,15 +149,13 @@ class SstFileManagerImpl : public SstFileManager { return bg_err_.severity() == Status::Severity::kSoftError; } - Env* env_; + std::shared_ptr clock_; std::shared_ptr fs_; std::shared_ptr logger_; // Mutex to protect tracked_files_, total_files_size_ port::Mutex mu_; // The summation of the sizes of all files in tracked_files_ map uint64_t total_files_size_; - // The summation of all output files of in-progress compactions - uint64_t in_progress_files_size_; // Compactions should only execute if they can leave at least // this amount of buffer space for logs and flushes uint64_t compaction_buffer_size_; @@ -170,9 +164,7 @@ class SstFileManagerImpl : public SstFileManager { // A map containing all tracked files and there sizes // file_path => file_size std::unordered_map tracked_files_; - // A set of files belonging to in-progress compactions - std::unordered_set in_progress_files_; - // The maximum allowed space (in bytes) for sst files. + // The maximum allowed space (in bytes) for sst and blob files. uint64_t max_allowed_space_; // DeleteScheduler used to throttle file deletition. 
DeleteScheduler delete_scheduler_; @@ -192,7 +184,7 @@ class SstFileManagerImpl : public SstFileManager { // compactions to run full throttle. If disk space is below this trigger, // compactions will be gated by free disk space > input size uint64_t free_space_trigger_; - // List of database error handler instances tracked by this sst file manager + // List of database error handler instances tracked by this SstFileManager. std::list error_handler_list_; // Pointer to ErrorHandler instance that is currently processing recovery ErrorHandler* cur_instance_; diff --git a/file/writable_file_writer.cc b/file/writable_file_writer.cc index eafd8b66acc..d009542c727 100644 --- a/file/writable_file_writer.cc +++ b/file/writable_file_writer.cc @@ -16,19 +16,33 @@ #include "monitoring/histogram.h" #include "monitoring/iostats_context_imp.h" #include "port/port.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" +#include "util/crc32c.h" #include "util/random.h" #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { +Status WritableFileWriter::Create(const std::shared_ptr& fs, + const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* writer, + IODebugContext* dbg) { + std::unique_ptr file; + Status s = fs->NewWritableFile(fname, file_opts, &file, dbg); + if (s.ok()) { + writer->reset(new WritableFileWriter(std::move(file), fname, file_opts)); + } + return s; +} + IOStatus WritableFileWriter::Append(const Slice& data) { const char* src = data.data(); size_t left = data.size(); IOStatus s; pending_sync_ = true; - TEST_KILL_RANDOM("WritableFileWriter::Append:0", - rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Append:0", REDUCE_ODDS2); // Calculate the checksum of appended data UpdateFileChecksum(data); @@ -89,7 +103,7 @@ IOStatus WritableFileWriter::Append(const Slice& data) { s = WriteBuffered(src, left); } - TEST_KILL_RANDOM("WritableFileWriter::Append:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Append:1"); if (s.ok()) { filesize_ += data.size(); } @@ -177,7 +191,7 @@ IOStatus WritableFileWriter::Close() { } } - TEST_KILL_RANDOM("WritableFileWriter::Close:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Close:0"); { #ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; @@ -198,7 +212,7 @@ IOStatus WritableFileWriter::Close() { } writable_file_.reset(); - TEST_KILL_RANDOM("WritableFileWriter::Close:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Close:1"); if (s.ok() && checksum_generator_ != nullptr && !checksum_finalized_) { checksum_generator_->Finalize(); @@ -212,8 +226,7 @@ IOStatus WritableFileWriter::Close() { // enabled IOStatus WritableFileWriter::Flush() { IOStatus s; - TEST_KILL_RANDOM("WritableFileWriter::Flush:0", - rocksdb_kill_odds * REDUCE_ODDS2); + TEST_KILL_RANDOM_WITH_WEIGHT("WritableFileWriter::Flush:0", REDUCE_ODDS2); if (buf_.CurrentSize() > 0) { if (use_direct_io()) { @@ -302,14 +315,14 @@ IOStatus WritableFileWriter::Sync(bool use_fsync) { if (!s.ok()) { return s; } - TEST_KILL_RANDOM("WritableFileWriter::Sync:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Sync:0"); if (!use_direct_io() && pending_sync_) { s = SyncInternal(use_fsync); if (!s.ok()) { return s; } } - TEST_KILL_RANDOM("WritableFileWriter::Sync:1", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::Sync:1"); pending_sync_ = false; return IOStatus::OK(); } @@ -331,7 +344,7 @@ IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { 
IOSTATS_TIMER_GUARD(fsync_nanos); TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0"); auto prev_perf_level = GetPerfLevel(); - IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); #ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { @@ -381,6 +394,8 @@ IOStatus WritableFileWriter::WriteBuffered(const char* data, size_t size) { assert(!use_direct_io()); const char* src = data; size_t left = size; + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; while (left > 0) { size_t allowed; @@ -406,8 +421,16 @@ IOStatus WritableFileWriter::WriteBuffered(const char* data, size_t size) { #endif { auto prev_perf_level = GetPerfLevel(); - IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, env_); - s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr); + + IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); + if (perform_data_verification_) { + Crc32cHandoffChecksumCalculation(src, allowed, checksum_buf); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->Append(Slice(src, allowed), IOOptions(), v_info, + nullptr); + } else { + s = writable_file_->Append(Slice(src, allowed), IOOptions(), nullptr); + } SetPerfLevel(prev_perf_level); } #ifndef ROCKSDB_LITE @@ -422,7 +445,7 @@ IOStatus WritableFileWriter::WriteBuffered(const char* data, size_t size) { } IOSTATS_ADD(bytes_written, allowed); - TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0", rocksdb_kill_odds); + TEST_KILL_RANDOM("WritableFileWriter::WriteBuffered:0"); left -= allowed; src += allowed; @@ -437,6 +460,19 @@ void WritableFileWriter::UpdateFileChecksum(const Slice& data) { } } +// Currently, crc32c checksum is used to calculate the checksum value of the +// content in the input buffer for handoff. In the future, the checksum might be +// calculated from the existing crc32c checksums of the in WAl and Manifest +// records, or even SST file blocks. +// TODO: effectively use the existing checksum of the data being writing to +// generate the crc32c checksum instead of a raw calculation. +void WritableFileWriter::Crc32cHandoffChecksumCalculation(const char* data, + size_t size, + char* buf) { + uint32_t v_crc32c = crc32c::Extend(0, data, size); + EncodeFixed32(buf, v_crc32c); +} + // This flushes the accumulated data in the buffer. We pad data with zeros if // necessary to the whole page. // However, during automatic flushes padding would not be necessary. 
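The checksum handoff added above is deliberately simple: the writer computes a little-endian crc32c over exactly the bytes it hands to `Append`/`PositionedAppend` and ships it in `DataVerificationInfo::checksum`. Below is a minimal sketch of both ends, assuming RocksDB's `util/crc32c.h` and `util/coding.h` helpers; `VerifyHandoffChecksum` is a hypothetical helper illustrating what a verifying `FileSystem` could do with the handed-off value, not part of this patch.

```
#include <cstddef>
#include <cstdint>

#include "rocksdb/slice.h"
#include "util/coding.h"   // EncodeFixed32 / DecodeFixed32
#include "util/crc32c.h"   // crc32c::Extend

namespace ROCKSDB_NAMESPACE {

// Producer side: what WritableFileWriter does right before handing the
// buffer to FSWritableFile::Append / PositionedAppend.
inline Slice MakeHandoffChecksum(const char* data, size_t size,
                                 char (&buf)[sizeof(uint32_t)]) {
  EncodeFixed32(buf, crc32c::Extend(0, data, size));
  return Slice(buf, sizeof(uint32_t));
}

// Consumer side (hypothetical): a FileSystem that honors
// DataVerificationInfo can recompute the crc32c over the bytes it received
// and compare against the handed-off value before acknowledging the write.
inline bool VerifyHandoffChecksum(const Slice& data, const Slice& checksum) {
  return checksum.size() == sizeof(uint32_t) &&
         DecodeFixed32(checksum.data()) ==
             crc32c::Extend(0, data.data(), data.size());
}

}  // namespace ROCKSDB_NAMESPACE
```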
@@ -467,6 +503,8 @@ IOStatus WritableFileWriter::WriteDirect() { const char* src = buf_.BufferStart(); uint64_t write_offset = next_write_offset_; size_t left = buf_.CurrentSize(); + DataVerificationInfo v_info; + char checksum_buf[sizeof(uint32_t)]; while (left > 0) { // Check how much is allowed @@ -487,8 +525,16 @@ IOStatus WritableFileWriter::WriteDirect() { start_ts = FileOperationInfo::StartNow(); } // direct writes must be positional - s = writable_file_->PositionedAppend(Slice(src, size), write_offset, - IOOptions(), nullptr); + if (perform_data_verification_) { + Crc32cHandoffChecksumCalculation(src, size, checksum_buf); + v_info.checksum = Slice(checksum_buf, sizeof(uint32_t)); + s = writable_file_->PositionedAppend(Slice(src, size), write_offset, + IOOptions(), v_info, nullptr); + } else { + s = writable_file_->PositionedAppend(Slice(src, size), write_offset, + IOOptions(), nullptr); + } + if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s); diff --git a/file/writable_file_writer.h b/file/writable_file_writer.h index 51fbcc04b6f..e6894281cc2 100644 --- a/file/writable_file_writer.h +++ b/file/writable_file_writer.h @@ -14,7 +14,6 @@ #include "db/version_edit.h" #include "env/file_system_tracer.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/file_checksum.h" #include "rocksdb/file_system.h" #include "rocksdb/io_status.h" @@ -25,6 +24,7 @@ namespace ROCKSDB_NAMESPACE { class Statistics; +class SystemClock; // WritableFileWriter is a wrapper on top of Env::WritableFile. It provides // facilities to: @@ -118,10 +118,12 @@ class WritableFileWriter { bool ShouldNotifyListeners() const { return !listeners_.empty(); } void UpdateFileChecksum(const Slice& data); + void Crc32cHandoffChecksumCalculation(const char* data, size_t size, + char* buf); std::string file_name_; FSWritableFilePtr writable_file_; - Env* env_; + SystemClock* clock_; AlignedBuffer buf_; size_t max_buffer_size_; // Actually written data size can be used for truncate @@ -141,18 +143,20 @@ class WritableFileWriter { std::vector> listeners_; std::unique_ptr checksum_generator_; bool checksum_finalized_; + bool perform_data_verification_; public: WritableFileWriter( std::unique_ptr&& file, const std::string& _file_name, - const FileOptions& options, Env* env = nullptr, + const FileOptions& options, SystemClock* clock = nullptr, const std::shared_ptr& io_tracer = nullptr, Statistics* stats = nullptr, const std::vector>& listeners = {}, - FileChecksumGenFactory* file_checksum_gen_factory = nullptr) + FileChecksumGenFactory* file_checksum_gen_factory = nullptr, + bool perform_data_verification = false) : file_name_(_file_name), - writable_file_(std::move(file), io_tracer), - env_(env), + writable_file_(std::move(file), io_tracer, _file_name), + clock_(clock), buf_(), max_buffer_size_(options.writable_file_max_buffer_size), filesize_(0), @@ -166,7 +170,8 @@ class WritableFileWriter { stats_(stats), listeners_(), checksum_generator_(nullptr), - checksum_finalized_(false) { + checksum_finalized_(false), + perform_data_verification_(perform_data_verification) { TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0", reinterpret_cast(max_buffer_size_)); buf_.Alignment(writable_file_->GetRequiredBufferAlignment()); @@ -190,6 +195,10 @@ class WritableFileWriter { } } + static Status Create(const std::shared_ptr& fs, + const std::string& fname, const FileOptions& file_opts, + std::unique_ptr* writer, + 
IODebugContext* dbg); WritableFileWriter(const WritableFileWriter&) = delete; WritableFileWriter& operator=(const WritableFileWriter&) = delete; diff --git a/fuzz/Makefile b/fuzz/Makefile new file mode 100644 index 00000000000..fa45b9e7831 --- /dev/null +++ b/fuzz/Makefile @@ -0,0 +1,61 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +ROOT_DIR = $(abspath $(shell pwd)/../) + +include $(ROOT_DIR)/make_config.mk + +PROTOBUF_CFLAGS = `pkg-config --cflags protobuf` +PROTOBUF_LDFLAGS = `pkg-config --libs protobuf` + +PROTOBUF_MUTATOR_CFLAGS = `pkg-config --cflags libprotobuf-mutator` +PROTOBUF_MUTATOR_LDFLAGS = `pkg-config --libs libprotobuf-mutator` + +ROCKSDB_INCLUDE_DIR = $(ROOT_DIR)/include +ROCKSDB_LIB_DIR = $(ROOT_DIR) + +PROTO_IN = $(ROOT_DIR)/fuzz/proto +PROTO_OUT = $(ROOT_DIR)/fuzz/proto/gen + +ifneq ($(FUZZ_ENV), ossfuzz) +CC = clang++ +CCFLAGS += -Wall -fsanitize=address,fuzzer +CFLAGS += $(PLATFORM_CXXFLAGS) $(PROTOBUF_CFLAGS) $(PROTOBUF_MUTATOR_CFLAGS) -I$(PROTO_OUT) -I$(ROCKSDB_INCLUDE_DIR) -I$(ROCKSDB_LIB_DIR) +LDFLAGS += $(PLATFORM_LDFLAGS) $(PROTOBUF_LDFLAGS) $(PROTOBUF_MUTATOR_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -lrocksdb +else +# OSS-Fuzz sets various environment flags that are used for compilation. +# These environment flags depend on which type of sanitizer build is being +# used, however, an ASan build would set the environment flags as follows: +# CFLAGS="-O1 -fno-omit-frame-pointer -gline-tables-only \ + -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -fsanitize=address \ + -fsanitize-address-use-after-scope -fsanitize=fuzzer-no-link" +# CXXFLAGS="-O1 -fno-omit-frame-pointer -gline-tables-only \ + -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION -fsanitize=address \ + -fsanitize-address-use-after-scope -fsanitize=fuzzer-no-link \ + -stdlib=libc++" +# LIB_FUZZING_ENGINE="-fsanitize=fuzzer" +CC = $(CXX) +CCFLAGS = $(CXXFLAGS) +CFLAGS += $(PROTOBUF_CFLAGS) $(PROTOBUF_MUTATOR_CFLAGS) -I$(PROTO_OUT) -I$(ROCKSDB_INCLUDE_DIR) -I$(ROCKSDB_LIB_DIR) +LDFLAGS += $(PLATFORM_LDFLAGS) $(LIB_FUZZING_ENGINE) $(PROTOBUF_MUTATOR_LDFLAGS) $(PROTOBUF_LDFLAGS) -L$(ROCKSDB_LIB_DIR) -lrocksdb +endif + +.PHONY: gen_proto + +gen_proto: + mkdir -p $(PROTO_OUT) + protoc \ + --proto_path=$(PROTO_IN) \ + --cpp_out=$(PROTO_OUT) \ + $(PROTO_IN)/*.proto + +db_fuzzer: db_fuzzer.cc + $(CC) $(CCFLAGS) -o db_fuzzer db_fuzzer.cc $(CFLAGS) $(LDFLAGS) + +db_map_fuzzer: gen_proto db_map_fuzzer.cc proto/gen/db_operation.pb.cc + $(CC) $(CCFLAGS) -o db_map_fuzzer db_map_fuzzer.cc proto/gen/db_operation.pb.cc $(CFLAGS) $(LDFLAGS) + +sst_file_writer_fuzzer: gen_proto sst_file_writer_fuzzer.cc proto/gen/db_operation.pb.cc + $(CC) $(CCFLAGS) -o sst_file_writer_fuzzer sst_file_writer_fuzzer.cc proto/gen/db_operation.pb.cc $(CFLAGS) $(LDFLAGS) diff --git a/fuzz/README.md b/fuzz/README.md new file mode 100644 index 00000000000..79b89bbc330 --- /dev/null +++ b/fuzz/README.md @@ -0,0 +1,160 @@ +# Fuzzing RocksDB + +## Overview + +This directory contains [fuzz tests](https://en.wikipedia.org/wiki/Fuzzing) for RocksDB. +RocksDB testing infrastructure currently includes unit tests and [stress tests](https://github.com/facebook/rocksdb/wiki/Stress-test), +we hope fuzz testing can catch more bugs. 
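Each test in this directory is an ordinary [libFuzzer](http://llvm.org/docs/LibFuzzer.html) target: it is built with `-fsanitize=address,fuzzer` (see the Makefile above) and exposes the `LLVMFuzzerTestOneInput` entry point, either directly or via `DEFINE_PROTO_FUZZER` for the structure-aware tests. As a rough sketch of the shape of such a target (the `CheckedParse` helper here is hypothetical and is not one of the fuzzers in this directory):

```
#include <cstddef>
#include <cstdint>

#include "rocksdb/slice.h"

// Hypothetical function under test; the real targets below exercise
// rocksdb::DB, rocksdb::SstFileWriter, etc.
static void CheckedParse(const rocksdb::Slice& input) {
  if (input.size() >= 3 && input[0] == 'F' && input[1] == 'U' &&
      input[2] == 'Z') {
    // A real bug on this path would surface as a sanitizer report or crash.
  }
}

// libFuzzer calls this entry point repeatedly with mutated inputs.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
  CheckedParse(rocksdb::Slice(reinterpret_cast<const char*>(data), size));
  return 0;  // non-zero return values are reserved by libFuzzer
}
```

The sections below cover the prerequisites and then walk through a full structure-aware example.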
+ +## Prerequisite + +We use [LLVM libFuzzer](http://llvm.org/docs/LibFuzzer.html) as the fuzzing engine, +so make sure you have [clang](https://clang.llvm.org/get_started.html) as your compiler. + +Some tests rely on [structure aware fuzzing](https://github.com/google/fuzzing/blob/master/docs/structure-aware-fuzzing.md). +We use [protobuf](https://developers.google.com/protocol-buffers) to define structured input to the fuzzer, +and use [libprotobuf-mutator](https://github.com/google/libprotobuf-mutator) as the custom libFuzzer mutator. +So make sure you have protobuf and libprotobuf-mutator installed, and make sure `pkg-config` can find them. + +## Example + +This example shows you how to apply structure-aware fuzzing to `rocksdb::SstFileWriter`. + +After walking through the steps to create the fuzzer, we'll introduce a bug into `rocksdb::SstFileWriter::Put`, +then show that the fuzzer can catch the bug. + +### Design the test + +We want the fuzzing engine to automatically generate a list of database operations. +We then apply these operations to `SstFileWriter` in sequence; +finally, after the SST file is generated, we use `SstFileReader` to check the file's checksum. + +### Define input + +We define the database operations in protobuf; each operation has an operation type and a key-value pair. +See [proto/db_operation.proto](proto/db_operation.proto) for details. + +### Define tests with the input + +In [sst_file_writer_fuzzer.cc](sst_file_writer_fuzzer.cc), +we define the tests to be run on the generated input: + +``` +DEFINE_PROTO_FUZZER(DBOperations& input) { + // apply the operations to SstFileWriter and use SstFileReader to verify checksum. + // ... +} +``` + +`SstFileWriter` requires the keys of the operations to be unique and in ascending order, +but the fuzzing engine generates the input randomly, so we need to process the generated input before +passing it to `DEFINE_PROTO_FUZZER`. This is accomplished by registering a post processor: + +``` +protobuf_mutator::libfuzzer::PostProcessorRegistration +``` + +### Compile and link the fuzzer + +In the rocksdb root directory, compile the rocksdb library with `make static_lib`. + +Go to the `fuzz` directory and +run `make sst_file_writer_fuzzer` to generate the fuzzer: +it will compile the rocksdb static library, generate the protobuf code, then compile and link `sst_file_writer_fuzzer`. + +### Introduce a bug + +Manually introduce a bug into `SstFileWriter::Put`: + +``` +diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc +index ab1ee7c4e..c7da9ffa0 100644 +--- a/table/sst_file_writer.cc ++++ b/table/sst_file_writer.cc +@@ -277,6 +277,11 @@ Status SstFileWriter::Add(const Slice& user_key, const Slice& value) { + } + + Status SstFileWriter::Put(const Slice& user_key, const Slice& value) { ++ if (user_key.starts_with("!")) { ++ if (value.ends_with("!")) { ++ return Status::Corruption("bomb"); ++ } ++ } + return rep_->Add(user_key, value, ValueType::kTypeValue); + } +``` + +The bug: if `user_key` starts with `!` and `value` ends with `!`, `Put` returns a `Corruption` status. + +### Run fuzz testing to catch the bug + +Run the fuzzer with `time ./sst_file_writer_fuzzer`.
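Because the binary is a regular libFuzzer target, the standard libFuzzer options also work; for example, to keep a corpus across runs and bound the session length (the `corpus` directory name is just a local choice):

```
mkdir -p corpus
time ./sst_file_writer_fuzzer corpus -max_total_time=60
```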
+ +Here is the output on my machine: + +``` +Corruption: bomb +==59680== ERROR: libFuzzer: deadly signal + #0 0x109487315 in __sanitizer_print_stack_trace+0x35 (libclang_rt.asan_osx_dynamic.dylib:x86_64+0x4d315) + #1 0x108d63f18 in fuzzer::PrintStackTrace() FuzzerUtil.cpp:205 + #2 0x108d47613 in fuzzer::Fuzzer::CrashCallback() FuzzerLoop.cpp:232 + #3 0x7fff6af535fc in _sigtramp+0x1c (libsystem_platform.dylib:x86_64+0x35fc) + #4 0x7ffee720f3ef () + #5 0x7fff6ae29807 in abort+0x77 (libsystem_c.dylib:x86_64+0x7f807) + #6 0x108cf1c4c in TestOneProtoInput(DBOperations&)+0x113c (sst_file_writer_fuzzer:x86_64+0x100302c4c) + #7 0x108cf09be in LLVMFuzzerTestOneInput+0x16e (sst_file_writer_fuzzer:x86_64+0x1003019be) + #8 0x108d48ce0 in fuzzer::Fuzzer::ExecuteCallback(unsigned char const*, unsigned long) FuzzerLoop.cpp:556 + #9 0x108d48425 in fuzzer::Fuzzer::RunOne(unsigned char const*, unsigned long, bool, fuzzer::InputInfo*, bool*) FuzzerLoop.cpp:470 + #10 0x108d4a626 in fuzzer::Fuzzer::MutateAndTestOne() FuzzerLoop.cpp:698 + #11 0x108d4b325 in fuzzer::Fuzzer::Loop(std::__1::vector >&) FuzzerLoop.cpp:830 + #12 0x108d37fcd in fuzzer::FuzzerDriver(int*, char***, int (*)(unsigned char const*, unsigned long)) FuzzerDriver.cpp:829 + #13 0x108d652b2 in main FuzzerMain.cpp:19 + #14 0x7fff6ad5acc8 in start+0x0 (libdyld.dylib:x86_64+0x1acc8) + +NOTE: libFuzzer has rudimentary signal handlers. + Combine libFuzzer with AddressSanitizer or similar for better crash reports. +SUMMARY: libFuzzer: deadly signal +MS: 7 Custom-CustomCrossOver-InsertByte-Custom-ChangeBit-Custom-CustomCrossOver-; base unit: 90863b4d83c3f994bba0a417d0c2ee3b68f9e795 +0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x21,0x22,0xa,0x20,0x20,0x76,0x61,0x6c,0x75,0x65,0x3a,0x20,0x22,0x21,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x2b,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x2e,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa,0x6f,0x70,0x65,0x72,0x61,0x74,0x69,0x6f,0x6e,0x73,0x20,0x7b,0xa,0x20,0x20,0x6b,0x65,0x79,0x3a,0x20,0x22,0x5c,0x32,0x35,0x33,0x22,0xa,0x20,0x20,0x74,0x79,0x70,0x65,0x3a,0x20,0x50,0x55,0x54,0xa,0x7d,0xa, +operations {\x0a key: \"!\"\x0a value: \"!\"\x0a type: PUT\x0a}\x0aoperations {\x0a key: \"+\"\x0a type: PUT\x0a}\x0aoperations {\x0a key: \".\"\x0a type: PUT\x0a}\x0aoperations {\x0a key: \"\\253\"\x0a type: PUT\x0a}\x0a +artifact_prefix='./'; Test unit written to ./crash-a1460be302d09b548e61787178d9edaa40aea467 +Base64: b3BlcmF0aW9ucyB7CiAga2V5OiAiISIKICB2YWx1ZTogIiEiCiAgdHlwZTogUFVUCn0Kb3BlcmF0aW9ucyB7CiAga2V5OiAiKyIKICB0eXBlOiBQVVQKfQpvcGVyYXRpb25zIHsKICBrZXk6ICIuIgogIHR5cGU6IFBVVAp9Cm9wZXJhdGlvbnMgewogIGtleTogIlwyNTMiCiAgdHlwZTogUFVUCn0K +./sst_file_writer_fuzzer 5.97s user 4.40s system 64% cpu 16.195 total +``` + +Within 6 seconds, it catches the bug. + +The input that triggers the bug is persisted in `./crash-a1460be302d09b548e61787178d9edaa40aea467`: + +``` +$ cat ./crash-a1460be302d09b548e61787178d9edaa40aea467 +operations { + key: "!" + value: "!" + type: PUT +} +operations { + key: "+" + type: PUT +} +operations { + key: "." 
+ type: PUT +} +operations { + key: "\253" + type: PUT +} +``` + +### Reproduce the crash to debug + +The above crash can be reproduced by `./sst_file_writer_fuzzer ./crash-a1460be302d09b548e61787178d9edaa40aea467`, +so you can debug the crash. + +## Future Work + +According to [OSS-Fuzz](https://github.com/google/oss-fuzz), +`as of June 2020, OSS-Fuzz has found over 20,000 bugs in 300 open source projects.` + +RocksDB can join OSS-Fuzz together with other open source projects such as sqlite. diff --git a/fuzz/db_fuzzer.cc b/fuzz/db_fuzzer.cc new file mode 100644 index 00000000000..10b4fb8df06 --- /dev/null +++ b/fuzz/db_fuzzer.cc @@ -0,0 +1,159 @@ +#include + +#include "rocksdb/db.h" + +enum OperationType { + kPut, + kGet, + kDelete, + kGetProperty, + kIterator, + kSnapshot, + kOpenClose, + kColumn, + kCompactRange, + kSeekForPrev, + OP_COUNT +}; + +constexpr char db_path[] = "/tmp/testdb"; + +// Fuzzes DB operations by doing interpretations on the data. Both the +// sequence of API calls to be called on the DB as well as the arguments +// to each of these APIs are interpreted by way of the data buffer. +// The operations that the fuzzer supports are given by the OperationType +// enum. The goal is to capture sanitizer bugs, so the code should be +// compiled with a given sanitizer (ASan, UBSan, MSan). +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { + rocksdb::DB* db; + rocksdb::Options options; + options.create_if_missing = true; + rocksdb::Status status = rocksdb::DB::Open(options, db_path, &db); + if (!status.ok()) { + return 0; + } + FuzzedDataProvider fuzzed_data(data, size); + + // perform a sequence of calls on our db instance + int max_iter = static_cast(data[0]); + for (int i = 0; i < max_iter && i < size; i++) { + OperationType op = static_cast(data[i] % OP_COUNT); + + switch (op) { + case kPut: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + std::string val = fuzzed_data.ConsumeRandomLengthString(); + db->Put(rocksdb::WriteOptions(), key, val); + break; + } + case kGet: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + std::string value; + db->Get(rocksdb::ReadOptions(), key, &value); + break; + } + case kDelete: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + db->Delete(rocksdb::WriteOptions(), key); + break; + } + case kGetProperty: { + std::string prop; + std::string property_name = fuzzed_data.ConsumeRandomLengthString(); + db->GetProperty(property_name, &prop); + break; + } + case kIterator: { + rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions()); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + } + delete it; + break; + } + case kSnapshot: { + rocksdb::ReadOptions snapshot_options; + snapshot_options.snapshot = db->GetSnapshot(); + rocksdb::Iterator* it = db->NewIterator(snapshot_options); + db->ReleaseSnapshot(snapshot_options.snapshot); + delete it; + break; + } + case kOpenClose: { + db->Close(); + delete db; + status = rocksdb::DB::Open(options, db_path, &db); + if (!status.ok()) { + rocksdb::DestroyDB(db_path, options); + return 0; + } + + break; + } + case kColumn: { + rocksdb::ColumnFamilyHandle* cf; + rocksdb::Status s; + s = db->CreateColumnFamily(rocksdb::ColumnFamilyOptions(), "new_cf", + &cf); + s = db->DestroyColumnFamilyHandle(cf); + db->Close(); + delete db; + + // open DB with two column families + std::vector column_families; + // have to open default column family + column_families.push_back(rocksdb::ColumnFamilyDescriptor( + rocksdb::kDefaultColumnFamilyName, 
rocksdb::ColumnFamilyOptions())); + // open the new one, too + column_families.push_back(rocksdb::ColumnFamilyDescriptor( + "new_cf", rocksdb::ColumnFamilyOptions())); + std::vector handles; + s = rocksdb::DB::Open(rocksdb::DBOptions(), db_path, column_families, + &handles, &db); + + if (s.ok()) { + std::string key1 = fuzzed_data.ConsumeRandomLengthString(); + std::string val1 = fuzzed_data.ConsumeRandomLengthString(); + std::string key2 = fuzzed_data.ConsumeRandomLengthString(); + s = db->Put(rocksdb::WriteOptions(), handles[1], key1, val1); + std::string value; + s = db->Get(rocksdb::ReadOptions(), handles[1], key2, &value); + s = db->DropColumnFamily(handles[1]); + for (auto handle : handles) { + s = db->DestroyColumnFamilyHandle(handle); + } + } else { + status = rocksdb::DB::Open(options, db_path, &db); + if (!status.ok()) { + // At this point there is no saving to do. So we exit + rocksdb::DestroyDB(db_path, rocksdb::Options()); + return 0; + } + } + break; + } + case kCompactRange: { + std::string slice_start = fuzzed_data.ConsumeRandomLengthString(); + std::string slice_end = fuzzed_data.ConsumeRandomLengthString(); + + rocksdb::Slice begin(slice_start); + rocksdb::Slice end(slice_end); + rocksdb::CompactRangeOptions options; + rocksdb::Status s = db->CompactRange(options, &begin, &end); + break; + } + case kSeekForPrev: { + std::string key = fuzzed_data.ConsumeRandomLengthString(); + auto iter = db->NewIterator(rocksdb::ReadOptions()); + iter->SeekForPrev(key); + delete iter; + break; + } + } + } + + // Cleanup DB + db->Close(); + delete db; + rocksdb::DestroyDB(db_path, options); + return 0; +} diff --git a/fuzz/db_map_fuzzer.cc b/fuzz/db_map_fuzzer.cc new file mode 100644 index 00000000000..4d9637ad906 --- /dev/null +++ b/fuzz/db_map_fuzzer.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include +#include + +#include "proto/gen/db_operation.pb.h" +#include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "src/libfuzzer/libfuzzer_macro.h" +#include "util.h" + +protobuf_mutator::libfuzzer::PostProcessorRegistration reg = { + [](DBOperations* input, unsigned int /* seed */) { + const rocksdb::Comparator* comparator = rocksdb::BytewiseComparator(); + auto ops = input->mutable_operations(); + // Make sure begin <= end for DELETE_RANGE. + for (DBOperation& op : *ops) { + if (op.type() == OpType::DELETE_RANGE) { + auto begin = op.key(); + auto end = op.value(); + if (comparator->Compare(begin, end) > 0) { + std::swap(begin, end); + op.set_key(begin); + op.set_value(end); + } + } + } + }}; + +// Execute randomly generated operations on both a DB and a std::map, +// then reopen the DB and make sure that iterating the DB produces the +// same key-value pairs as iterating through the std::map. 
+DEFINE_PROTO_FUZZER(DBOperations& input) { + if (input.operations().empty()) { + return; + } + + const std::string kDbPath = "/tmp/db_map_fuzzer_test"; + auto fs = rocksdb::FileSystem::Default(); + if (fs->FileExists(kDbPath, rocksdb::IOOptions(), /*dbg=*/nullptr).ok()) { + std::cerr << "db path " << kDbPath << " already exists" << std::endl; + abort(); + } + + std::map kv; + rocksdb::DB* db = nullptr; + rocksdb::Options options; + options.create_if_missing = true; + CHECK_OK(rocksdb::DB::Open(options, kDbPath, &db)); + + for (const DBOperation& op : input.operations()) { + switch (op.type()) { + case OpType::PUT: { + CHECK_OK(db->Put(rocksdb::WriteOptions(), op.key(), op.value())); + kv[op.key()] = op.value(); + break; + } + case OpType::MERGE: { + break; + } + case OpType::DELETE: { + CHECK_OK(db->Delete(rocksdb::WriteOptions(), op.key())); + kv.erase(op.key()); + break; + } + case OpType::DELETE_RANGE: { + // [op.key(), op.value()) corresponds to [begin, end). + CHECK_OK(db->DeleteRange(rocksdb::WriteOptions(), + db->DefaultColumnFamily(), op.key(), + op.value())); + kv.erase(kv.lower_bound(op.key()), kv.lower_bound(op.value())); + break; + } + default: { + std::cerr << "Unsupported operation" << static_cast(op.type()); + return; + } + } + } + CHECK_OK(db->Close()); + delete db; + db = nullptr; + + CHECK_OK(rocksdb::DB::Open(options, kDbPath, &db)); + auto kv_it = kv.begin(); + rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions()); + for (it->SeekToFirst(); it->Valid(); it->Next(), kv_it++) { + CHECK_TRUE(kv_it != kv.end()); + CHECK_EQ(it->key().ToString(), kv_it->first); + CHECK_EQ(it->value().ToString(), kv_it->second); + } + CHECK_TRUE(kv_it == kv.end()); + delete it; + + CHECK_OK(db->Close()); + delete db; + CHECK_OK(rocksdb::DestroyDB(kDbPath, options)); +} diff --git a/fuzz/proto/db_operation.proto b/fuzz/proto/db_operation.proto new file mode 100644 index 00000000000..20a55eaa565 --- /dev/null +++ b/fuzz/proto/db_operation.proto @@ -0,0 +1,28 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// Defines database operations. +// Each operation is a key-value pair and an operation type. + +syntax = "proto2"; + +enum OpType { + PUT = 0; + MERGE = 1; + DELETE = 2; + DELETE_RANGE = 3; +} + +message DBOperation { + required string key = 1; + // value is ignored for DELETE. + // [key, value] is the range for DELETE_RANGE. + optional string value = 2; + required OpType type = 3; +} + +message DBOperations { + repeated DBOperation operations = 1; +} diff --git a/fuzz/sst_file_writer_fuzzer.cc b/fuzz/sst_file_writer_fuzzer.cc new file mode 100644 index 00000000000..a21544943ec --- /dev/null +++ b/fuzz/sst_file_writer_fuzzer.cc @@ -0,0 +1,187 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include +#include + +#include "proto/gen/db_operation.pb.h" +#include "rocksdb/file_system.h" +#include "rocksdb/sst_file_writer.h" +#include "src/libfuzzer/libfuzzer_macro.h" +#include "table/table_reader.h" +#include "util.h" + +using namespace ROCKSDB_NAMESPACE; + +// Keys in SST file writer operations must be unique and in ascending order. 
+// For each DBOperation generated by the fuzzer, this function is called on +// it to deduplicate and sort the keys in the DBOperations. +protobuf_mutator::libfuzzer::PostProcessorRegistration reg = { + [](DBOperations* input, unsigned int /* seed */) { + const Comparator* comparator = BytewiseComparator(); + auto ops = input->mutable_operations(); + + // Make sure begin <= end for DELETE_RANGE. + for (DBOperation& op : *ops) { + if (op.type() == OpType::DELETE_RANGE) { + auto begin = op.key(); + auto end = op.value(); + if (comparator->Compare(begin, end) > 0) { + std::swap(begin, end); + op.set_key(begin); + op.set_value(end); + } + } + } + + std::sort(ops->begin(), ops->end(), + [&comparator](const DBOperation& a, const DBOperation& b) { + return comparator->Compare(a.key(), b.key()) < 0; + }); + + auto last = std::unique( + ops->begin(), ops->end(), + [&comparator](const DBOperation& a, const DBOperation& b) { + return comparator->Compare(a.key(), b.key()) == 0; + }); + ops->erase(last, ops->end()); + }}; + +TableReader* NewTableReader(const std::string& sst_file_path, + const Options& options, + const EnvOptions& env_options, + const ImmutableCFOptions& cf_ioptions) { + // This code block is similar to SstFileReader::Open. + + uint64_t file_size = 0; + std::unique_ptr file_reader; + std::unique_ptr table_reader; + const auto& fs = options.env->GetFileSystem(); + FileOptions fopts(env_options); + Status s = options.env->GetFileSize(sst_file_path, fopts.io_options, + &file_size, nullptr); + if (s.ok()) { + s = RandomAccessFileReader::Create(fs, sst_file_path, fopts, &file_reader, + nullptr); + } + if (s.ok()) { + TableReaderOptions t_opt(cf_ioptions, /*prefix_extractor=*/nullptr, + env_options, cf_ioptions.internal_comparator); + t_opt.largest_seqno = kMaxSequenceNumber; + s = options.table_factory->NewTableReader(t_opt, std::move(file_reader), + file_size, &table_reader, + /*prefetch=*/false); + } + if (!s.ok()) { + std::cerr << "Failed to create TableReader for " << sst_file_path << ": " + << s.ToString() << std::endl; + abort(); + } + return table_reader.release(); +} + +ValueType ToValueType(OpType op_type) { + switch (op_type) { + case OpType::PUT: + return ValueType::kTypeValue; + case OpType::MERGE: + return ValueType::kTypeMerge; + case OpType::DELETE: + return ValueType::kTypeDeletion; + case OpType::DELETE_RANGE: + return ValueType::kTypeRangeDeletion; + default: + std::cerr << "Unknown operation type " << static_cast(op_type) + << std::endl; + abort(); + } +} + +// Fuzzes DB operations as input, let SstFileWriter generate a SST file +// according to the operations, then let TableReader read and check all the +// key-value pairs from the generated SST file. +DEFINE_PROTO_FUZZER(DBOperations& input) { + if (input.operations().empty()) { + return; + } + + std::string sstfile; + { + auto fs = FileSystem::Default(); + std::string dir; + IOOptions opt; + CHECK_OK(fs->GetTestDirectory(opt, &dir, nullptr)); + sstfile = dir + "/SstFileWriterFuzzer.sst"; + } + + Options options; + EnvOptions env_options(options); + ImmutableCFOptions cf_ioptions(options); + + // Generate sst file. 
+ SstFileWriter writer(env_options, options); + CHECK_OK(writer.Open(sstfile)); + for (const DBOperation& op : input.operations()) { + switch (op.type()) { + case OpType::PUT: { + CHECK_OK(writer.Put(op.key(), op.value())); + break; + } + case OpType::MERGE: { + CHECK_OK(writer.Merge(op.key(), op.value())); + break; + } + case OpType::DELETE: { + CHECK_OK(writer.Delete(op.key())); + break; + } + case OpType::DELETE_RANGE: { + CHECK_OK(writer.DeleteRange(op.key(), op.value())); + break; + } + default: { + std::cerr << "Unsupported operation" << static_cast(op.type()) + << std::endl; + abort(); + } + } + } + ExternalSstFileInfo info; + CHECK_OK(writer.Finish(&info)); + + // Iterate and verify key-value pairs. + std::unique_ptr table_reader( + NewTableReader(sstfile, options, env_options, cf_ioptions)); + ReadOptions roptions; + CHECK_OK(table_reader->VerifyChecksum(roptions, + TableReaderCaller::kUncategorized)); + std::unique_ptr it( + table_reader->NewIterator(roptions, /*prefix_extractor=*/nullptr, + /*arena=*/nullptr, /*skip_filters=*/true, + TableReaderCaller::kUncategorized)); + it->SeekToFirst(); + for (const DBOperation& op : input.operations()) { + if (op.type() == OpType::DELETE_RANGE) { + // InternalIterator cannot iterate over DELETE_RANGE entries. + continue; + } + CHECK_TRUE(it->Valid()); + ParsedInternalKey ikey; + CHECK_OK(ParseInternalKey(it->key(), &ikey, /*log_err_key=*/true)); + CHECK_EQ(ikey.user_key.ToString(), op.key()); + CHECK_EQ(ikey.sequence, 0); + CHECK_EQ(ikey.type, ToValueType(op.type())); + if (op.type() != OpType::DELETE) { + CHECK_EQ(op.value(), it->value().ToString()); + } + it->Next(); + } + CHECK_TRUE(!it->Valid()); + + // Delete sst file. + remove(sstfile.c_str()); +} diff --git a/fuzz/util.h b/fuzz/util.h new file mode 100644 index 00000000000..44ffaf536e5 --- /dev/null +++ b/fuzz/util.h @@ -0,0 +1,23 @@ +#pragma once + +#define CHECK_OK(expression) \ + do { \ + auto status = (expression); \ + if (!status.ok()) { \ + std::cerr << status.ToString() << std::endl; \ + abort(); \ + } \ + } while (0) + +#define CHECK_EQ(a, b) \ + if (a != b) { \ + std::cerr << "(" << #a << "=" << a << ") != (" << #b << "=" << b << ")" \ + << std::endl; \ + abort(); \ + } + +#define CHECK_TRUE(cond) \ + if (!(cond)) { \ + std::cerr << "\"" << #cond << "\" is false" << std::endl; \ + abort(); \ + } diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index c76c60416db..7804ec46b6c 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -143,6 +143,27 @@ struct CompressionOptions { // Default: false. bool enabled; + // Limit on data buffering when gathering samples to build a dictionary. Zero + // means no limit. When dictionary is disabled (`max_dict_bytes == 0`), + // enabling this limit (`max_dict_buffer_bytes != 0`) has no effect. + // + // In compaction, the buffering is limited to the target file size (see + // `target_file_size_base` and `target_file_size_multiplier`) even if this + // setting permits more buffering. Since we cannot determine where the file + // should be cut until data blocks are compressed with dictionary, buffering + // more than the target file size could lead to selecting samples that belong + // to a later output SST. + // + // Limiting too strictly may harm dictionary effectiveness since it forces + // RocksDB to pick samples from the initial portion of the output SST, which + // may not be representative of the whole file. 
Configuring this limit below + // `zstd_max_train_bytes` (when enabled) can restrict how many samples we can + // pass to the dictionary trainer. Configuring it below `max_dict_bytes` can + // restrict the size of the final dictionary. + // + // Default: 0 (unlimited) + uint64_t max_dict_buffer_bytes; + CompressionOptions() : window_bits(-14), level(kDefaultCompressionLevel), @@ -150,17 +171,30 @@ struct CompressionOptions { max_dict_bytes(0), zstd_max_train_bytes(0), parallel_threads(1), - enabled(false) {} + enabled(false), + max_dict_buffer_bytes(0) {} CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes, int _zstd_max_train_bytes, int _parallel_threads, - bool _enabled) + bool _enabled, uint64_t _max_dict_buffer_bytes) : window_bits(wbits), level(_lev), strategy(_strategy), max_dict_bytes(_max_dict_bytes), zstd_max_train_bytes(_zstd_max_train_bytes), parallel_threads(_parallel_threads), - enabled(_enabled) {} + enabled(_enabled), + max_dict_buffer_bytes(_max_dict_buffer_bytes) {} +}; + +// Temperature of a file. Used to pass to FileSystem for a different +// placement and/or coding. +// Reserve some numbers in the middle, in case we need to insert new tier +// there. +enum class Temperature : uint8_t { + kUnknown = 0, + kHot = 0x04, + kWarm = 0x08, + kCold = 0x0C, }; enum UpdateStatus { // Return status For inplace update callback @@ -237,6 +271,7 @@ struct AdvancedColumnFamilyOptions { // achieve point-in-time consistency using snapshot or iterator (assuming // concurrent updates). Hence iterator and multi-get will return results // which are not consistent as of any point-in-time. + // Backward iteration on memtables will not work either. // If inplace_callback function is not set, // Put(key, new_value) will update inplace the existing_value iff // * key exists in current memtable @@ -356,7 +391,8 @@ struct AdvancedColumnFamilyOptions { // size of one block in arena memory allocation. // If <= 0, a proper value is automatically calculated (usually 1/8 of - // writer_buffer_size, rounded up to a multiple of 4KB). + // writer_buffer_size, rounded up to a multiple of 4KB, or 1MB which ever is + // smaller). // // There are two additional restriction of the specified size: // (1) size should be in the range of [4096, 2 << 30] and @@ -674,10 +710,14 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API bool report_bg_io_stats = false; - // Files older than TTL will go through the compaction process. + // Files containing updates older than TTL will go through the compaction + // process. This usually happens in a cascading way so that those entries + // will be compacted to bottommost level/file. + // The feature is used to remove stale entries that have been deleted or + // updated from the file system. // Pre-req: This needs max_open_files to be set to -1. // In Level: Non-bottom-level files older than TTL will go through the - // compation process. + // compaction process. // In FIFO: Files older than TTL will be deleted. // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 // In FIFO, this option will have the same meaning as @@ -694,6 +734,9 @@ struct AdvancedColumnFamilyOptions { // Files older than this value will be picked up for compaction, and // re-written to the same level as they were before. + // One main use of the feature is to make sure a file goes through compaction + // filters periodically. Users can also use the feature to clear up SST + // files using old format. 
// // A file's age is computed by looking at file_creation_time or creation_time // table properties in order, if they have valid non-zero values; if not, the @@ -727,19 +770,25 @@ struct AdvancedColumnFamilyOptions { // data is left uncompressed (unless compression is also requested). uint64_t sample_for_compression = 0; - // UNDER CONSTRUCTION -- DO NOT USE + // EXPERIMENTAL + // The feature is still in development and is incomplete. + // If this option is set, when creating bottommost files, pass this + // temperature to FileSystem used. Should be no-op for default FileSystem + // and users need to plug in their own FileSystem to take advantage of it. + Temperature bottommost_temperature = Temperature::kUnknown; + // When set, large values (blobs) are written to separate blob files, and // only pointers to them are stored in SST files. This can reduce write // amplification for large-value use cases at the cost of introducing a level // of indirection for reads. See also the options min_blob_size, - // blob_file_size, and blob_compression_type below. + // blob_file_size, blob_compression_type, enable_blob_garbage_collection, + // and blob_garbage_collection_age_cutoff below. // // Default: false // // Dynamically changeable through the SetOptions() API bool enable_blob_files = false; - // UNDER CONSTRUCTION -- DO NOT USE // The size of the smallest value to be stored separately in a blob file. // Values which have an uncompressed size smaller than this threshold are // stored alongside the keys in SST files in the usual fashion. A value of @@ -752,7 +801,6 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API uint64_t min_blob_size = 0; - // UNDER CONSTRUCTION -- DO NOT USE // The size limit for blob files. When writing blob files, a new file is // opened once this limit is reached. Note that enable_blob_files has to be // set in order for this option to have any effect. @@ -762,7 +810,6 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API uint64_t blob_file_size = 1ULL << 28; - // UNDER CONSTRUCTION -- DO NOT USE // The compression algorithm to use for large values stored in blob files. // Note that enable_blob_files has to be set in order for this option to have // any effect. @@ -772,6 +819,28 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API CompressionType blob_compression_type = kNoCompression; + // Enables garbage collection of blobs. Blob GC is performed as part of + // compaction. Valid blobs residing in blob files older than a cutoff get + // relocated to new files as they are encountered during compaction, which + // makes it possible to clean up blob files once they contain nothing but + // obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. + // + // Default: false + // + // Dynamically changeable through the SetOptions() API + bool enable_blob_garbage_collection = false; + + // The cutoff in terms of blob file age for garbage collection. Blobs in + // the oldest N blob files will be relocated when encountered during + // compaction, where N = garbage_collection_cutoff * number_of_blob_files. + // Note that enable_blob_garbage_collection has to be set in order for this + // option to have any effect. 
+ // + // Default: 0.25 + // + // Dynamically changeable through the SetOptions() API + double blob_garbage_collection_age_cutoff = 0.25; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index b8c72ae75d4..8280b25a20e 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -71,8 +71,11 @@ extern "C" { typedef struct rocksdb_t rocksdb_t; typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t; typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t; +typedef struct rocksdb_backupable_db_options_t rocksdb_backupable_db_options_t; typedef struct rocksdb_restore_options_t rocksdb_restore_options_t; -typedef struct rocksdb_cache_t rocksdb_cache_t; +typedef struct rocksdb_memory_allocator_t rocksdb_memory_allocator_t; +typedef struct rocksdb_lru_cache_options_t rocksdb_lru_cache_options_t; +typedef struct rocksdb_cache_t rocksdb_cache_t; typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t; typedef struct rocksdb_compactionfiltercontext_t rocksdb_compactionfiltercontext_t; @@ -145,6 +148,10 @@ extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_as_secondary( extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open( const rocksdb_options_t* options, const char* path, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* +rocksdb_backup_engine_open_opts(const rocksdb_backupable_db_options_t* options, + rocksdb_env_t* env, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup( rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr); @@ -156,7 +163,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_purge_old_backups( rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr); extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t* -rocksdb_restore_options_create(); +rocksdb_restore_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy( rocksdb_restore_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files( @@ -203,6 +210,100 @@ extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_info_destroy( extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_close( rocksdb_backup_engine_t* be); +/* BackupableDBOptions */ + +extern ROCKSDB_LIBRARY_API rocksdb_backupable_db_options_t* +rocksdb_backupable_db_options_create(const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_set_backup_dir( + rocksdb_backupable_db_options_t* options, const char* backup_dir); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_set_env( + rocksdb_backupable_db_options_t* options, rocksdb_env_t* env); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_share_table_files( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backupable_db_options_get_share_table_files( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_set_sync( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_backupable_db_options_get_sync( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_destroy_old_data( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API 
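The new column family options documented in advanced_options.h above can be combined roughly as follows. This is a minimal, hypothetical sketch: `SetupOptions` is an invented helper, the values are illustrative, and it assumes LZ4/ZSTD support and blob files are available in the build.

```
#include "rocksdb/options.h"

void SetupOptions(ROCKSDB_NAMESPACE::Options& options) {
  // Cap the data buffered while sampling for dictionary compression
  // (0 would mean unlimited).
  options.compression_opts.max_dict_bytes = 16 * 1024;
  options.compression_opts.zstd_max_train_bytes = 64 * 1024;
  options.compression_opts.max_dict_buffer_bytes = 64 * 1024 * 1024;

  // Ask the FileSystem to place bottommost files on a colder tier; this is
  // a no-op for the default FileSystem.
  options.bottommost_temperature = ROCKSDB_NAMESPACE::Temperature::kWarm;

  // Store large values in blob files and garbage-collect the oldest 25%
  // of blob files as compactions encounter them.
  options.enable_blob_files = true;
  options.min_blob_size = 4096;
  options.blob_file_size = 1ULL << 28;
  options.blob_compression_type = ROCKSDB_NAMESPACE::kLZ4Compression;
  options.enable_blob_garbage_collection = true;
  options.blob_garbage_collection_age_cutoff = 0.25;
}
```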
unsigned char +rocksdb_backupable_db_options_get_destroy_old_data( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_backup_log_files( + rocksdb_backupable_db_options_t* options, unsigned char val); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_backupable_db_options_get_backup_log_files( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_backup_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backupable_db_options_get_backup_rate_limit( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_restore_rate_limit( + rocksdb_backupable_db_options_t* options, uint64_t limit); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backupable_db_options_get_restore_rate_limit( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_max_background_operations( + rocksdb_backupable_db_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backupable_db_options_get_max_background_operations( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options, uint64_t size); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_backupable_db_options_get_callback_trigger_interval_size( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backupable_db_options_get_max_valid_backups_to_open( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void +rocksdb_backupable_db_options_set_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options, int val); + +extern ROCKSDB_LIBRARY_API int +rocksdb_backupable_db_options_get_share_files_with_checksum_naming( + rocksdb_backupable_db_options_t* options); + +extern ROCKSDB_LIBRARY_API void rocksdb_backupable_db_options_destroy( + rocksdb_backupable_db_options_t*); + +/* Checkpoint */ + extern ROCKSDB_LIBRARY_API rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db, char** errptr); @@ -397,13 +498,13 @@ extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf( extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes( rocksdb_t* db, int num_ranges, const char* const* range_start_key, const size_t* range_start_key_len, const char* const* range_limit_key, - const size_t* range_limit_key_len, uint64_t* sizes); + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf( rocksdb_t* db, rocksdb_column_family_handle_t* column_family, int num_ranges, const char* const* range_start_key, const size_t* range_start_key_len, const char* const* range_limit_key, - const size_t* range_limit_key_len, uint64_t* sizes); + const size_t* range_limit_key_len, uint64_t* sizes, char** errptr); extern ROCKSDB_LIBRARY_API void rocksdb_compact_range(rocksdb_t* db, const char* start_key, @@ -438,6 +539,10 @@ extern ROCKSDB_LIBRARY_API void rocksdb_flush_cf( rocksdb_t* db, const rocksdb_flushoptions_t* options, rocksdb_column_family_handle_t* column_family, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_flush_wal(rocksdb_t* db, 
+ unsigned char sync, + char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db, char** errptr); @@ -483,7 +588,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_destroy (const rocksdb_wal_iter /* Write batch */ -extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create(); +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create( + void); extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create_from( const char* rep, size_t size); extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_destroy( @@ -712,7 +818,7 @@ extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_writebatch_wi_create_iter /* Block based table options */ extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t* -rocksdb_block_based_options_create(); +rocksdb_block_based_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_destroy( rocksdb_block_based_table_options_t* options); extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_size( @@ -787,7 +893,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory( /* Cuckoo table options */ extern ROCKSDB_LIBRARY_API rocksdb_cuckoo_table_options_t* -rocksdb_cuckoo_options_create(); +rocksdb_cuckoo_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_destroy( rocksdb_cuckoo_table_options_t* options); extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_hash_ratio( @@ -811,7 +917,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_set_options( extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf( rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, const char* const keys[], const char* const values[], char** errptr); -extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(); +extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create_copy( rocksdb_options_t*); @@ -898,12 +1004,30 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options( extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options_zstd_max_train_bytes(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_zstd_max_train_bytes( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_parallel_threads(rocksdb_options_t*, + int); +extern ROCKSDB_LIBRARY_API int +rocksdb_options_get_compression_options_parallel_threads( + rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_compression_options_max_dict_buffer_bytes( + rocksdb_options_t* opt); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bottommost_compression_options(rocksdb_options_t*, int, int, int, int, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes( rocksdb_options_t*, int, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes( + rocksdb_options_t*, uint64_t, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor( rocksdb_options_t*, rocksdb_slicetransform_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels( @@ -962,6 
+1086,37 @@ extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open( rocksdb_options_t* opt); +/* Blob Options Settings */ +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_files( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_files( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_blob_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_min_blob_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_size( + rocksdb_options_t* opt, uint64_t val); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_blob_file_size(rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_compression_type( + rocksdb_options_t* opt, int val); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_compression_type( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_enable_blob_gc( + rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_enable_blob_gc( + rocksdb_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_gc_age_cutoff( + rocksdb_options_t* opt, double val); +extern ROCKSDB_LIBRARY_API double rocksdb_options_get_blob_gc_age_cutoff( + rocksdb_options_t* opt); + /* returns a pointer to a malloc()-ed, null terminated string */ extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( rocksdb_options_t* opt); @@ -1275,6 +1430,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache( rocksdb_options_t* opt, rocksdb_cache_t* cache ); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_add_compact_on_deletion_collector_factory( + rocksdb_options_t*, size_t window_size, size_t num_dels_trigger); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush( + rocksdb_options_t* opt, unsigned char); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush( + rocksdb_options_t* opt); + /* RateLimiter */ extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); @@ -1363,7 +1526,8 @@ enum { }; extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); -extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create(); +extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset( rocksdb_perfcontext_t* context); extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report( @@ -1461,7 +1625,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_mergeoperator_destroy( /* Read options */ -extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_destroy( rocksdb_readoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_verify_checksums( @@ -1518,11 +1683,19 @@ extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_ignore_range_deletions( rocksdb_readoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API unsigned char rocksdb_readoptions_get_ignore_range_deletions(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_deadline( + rocksdb_readoptions_t*, uint64_t microseconds); +extern 
ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_deadline(rocksdb_readoptions_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_io_timeout( + rocksdb_readoptions_t*, uint64_t microseconds); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t*); /* Write options */ -extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* -rocksdb_writeoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t* rocksdb_writeoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_destroy( rocksdb_writeoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync( @@ -1556,7 +1729,7 @@ rocksdb_writeoptions_get_memtable_insert_hint_per_batch( /* Compact range options */ extern ROCKSDB_LIBRARY_API rocksdb_compactoptions_t* -rocksdb_compactoptions_create(); +rocksdb_compactoptions_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_destroy( rocksdb_compactoptions_t*); extern ROCKSDB_LIBRARY_API void @@ -1582,8 +1755,8 @@ extern ROCKSDB_LIBRARY_API int rocksdb_compactoptions_get_target_level( /* Flush options */ -extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* -rocksdb_flushoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t* rocksdb_flushoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_destroy( rocksdb_flushoptions_t*); extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait( @@ -1591,11 +1764,31 @@ extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait( extern ROCKSDB_LIBRARY_API unsigned char rocksdb_flushoptions_get_wait( rocksdb_flushoptions_t*); +/* Memory allocator */ + +extern ROCKSDB_LIBRARY_API rocksdb_memory_allocator_t* +rocksdb_jemalloc_nodump_allocator_create(char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_allocator_destroy( + rocksdb_memory_allocator_t*); + /* Cache */ +extern ROCKSDB_LIBRARY_API rocksdb_lru_cache_options_t* +rocksdb_lru_cache_options_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_destroy( + rocksdb_lru_cache_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_capacity( + rocksdb_lru_cache_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_lru_cache_options_set_memory_allocator( + rocksdb_lru_cache_options_t*, rocksdb_memory_allocator_t*); + extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru( size_t capacity); +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru_opts( + rocksdb_lru_cache_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API void rocksdb_cache_disown_data( + rocksdb_cache_t* cache); extern ROCKSDB_LIBRARY_API void rocksdb_cache_set_capacity( rocksdb_cache_t* cache, size_t capacity); extern ROCKSDB_LIBRARY_API size_t @@ -1612,16 +1805,24 @@ extern ROCKSDB_LIBRARY_API void rocksdb_dbpath_destroy(rocksdb_dbpath_t*); /* Env */ -extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(); -extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env(void); +extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_mem_env(void); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads( rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_background_threads( + rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n); +extern 
ROCKSDB_LIBRARY_API int rocksdb_env_get_high_priority_background_threads( + rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_low_priority_background_threads( rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int rocksdb_env_get_low_priority_background_threads( + rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env, int n); +extern ROCKSDB_LIBRARY_API int +rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads( rocksdb_env_t* env); extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env); @@ -1631,7 +1832,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_env_lower_high_priority_thread_pool_cpu_ extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*); -extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create(); +extern ROCKSDB_LIBRARY_API rocksdb_envoptions_t* rocksdb_envoptions_create( + void); extern ROCKSDB_LIBRARY_API void rocksdb_envoptions_destroy( rocksdb_envoptions_t* opt); @@ -1666,7 +1868,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_destroy( rocksdb_sstfilewriter_t* writer); extern ROCKSDB_LIBRARY_API rocksdb_ingestexternalfileoptions_t* -rocksdb_ingestexternalfileoptions_create(); +rocksdb_ingestexternalfileoptions_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_set_move_files( rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files); @@ -1712,7 +1914,7 @@ rocksdb_slicetransform_create( extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t); extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t* -rocksdb_slicetransform_create_noop(); +rocksdb_slicetransform_create_noop(void); extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy( rocksdb_slicetransform_t*); @@ -1724,33 +1926,54 @@ enum { }; extern ROCKSDB_LIBRARY_API rocksdb_universal_compaction_options_t* -rocksdb_universal_compaction_options_create(); +rocksdb_universal_compaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_size_ratio( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_size_ratio( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_min_merge_width( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_min_merge_width( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_max_merge_width( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_merge_width( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_max_size_amplification_percent( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_max_size_amplification_percent( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_compression_size_percent( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_compression_size_percent( + rocksdb_universal_compaction_options_t*); extern 
ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_set_stop_style( rocksdb_universal_compaction_options_t*, int); +extern ROCKSDB_LIBRARY_API int +rocksdb_universal_compaction_options_get_stop_style( + rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy( rocksdb_universal_compaction_options_t*); extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t* -rocksdb_fifo_compaction_options_create(); +rocksdb_fifo_compaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_set_max_table_files_size( rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_fifo_compaction_options_get_max_table_files_size( + rocksdb_fifo_compaction_options_t* fifo_opts); extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy( rocksdb_fifo_compaction_options_t* fifo_opts); @@ -1801,7 +2024,7 @@ extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open( const rocksdb_transactiondb_options_t* txn_db_options, const char* name, char** errptr); -rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( +extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( const rocksdb_options_t* options, const rocksdb_transactiondb_options_t* txn_db_options, const char* name, int num_column_families, const char* const* column_family_names, @@ -1853,7 +2076,7 @@ extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update( const char* key, size_t klen, size_t* vlen, unsigned char exclusive, char** errptr); -char* rocksdb_transaction_get_for_update_cf( +extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update_cf( rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, size_t* vlen, unsigned char exclusive, char** errptr); @@ -1977,7 +2200,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close( /* Transaction Options */ extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_options_t* -rocksdb_transactiondb_options_create(); +rocksdb_transactiondb_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_options_destroy( rocksdb_transactiondb_options_t* opt); @@ -1997,7 +2220,7 @@ rocksdb_transactiondb_options_set_default_lock_timeout( rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout); extern ROCKSDB_LIBRARY_API rocksdb_transaction_options_t* -rocksdb_transaction_options_create(); +rocksdb_transaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_destroy( rocksdb_transaction_options_t* opt); @@ -2023,7 +2246,7 @@ rocksdb_transaction_options_set_max_write_batch_size( rocksdb_transaction_options_t* opt, size_t size); extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t* -rocksdb_optimistictransaction_options_create(); +rocksdb_optimistictransaction_options_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransaction_options_destroy( rocksdb_optimistictransaction_options_t* opt); @@ -2049,7 +2272,7 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value( const rocksdb_pinnableslice_t* t, size_t* vlen); extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t* - rocksdb_memory_consumers_create(); +rocksdb_memory_consumers_create(void); extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db( rocksdb_memory_consumers_t* consumers, rocksdb_t* db); extern 
ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_cache( diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index e4c404333df..c1ce88dbd82 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -22,9 +22,11 @@ #pragma once -#include +#include +#include #include #include + #include "rocksdb/memory_allocator.h" #include "rocksdb/slice.h" #include "rocksdb/statistics.h" @@ -34,6 +36,7 @@ namespace ROCKSDB_NAMESPACE { class Cache; struct ConfigOptions; +class SecondaryCache; extern const bool kDefaultToAdaptiveMutex; @@ -59,10 +62,10 @@ struct LRUCacheOptions { // Percentage of cache reserved for high priority entries. // If greater than zero, the LRU list will be split into a high-pri - // list and a low-pri list. High-pri entries will be insert to the + // list and a low-pri list. High-pri entries will be inserted to the // tail of high-pri list, while low-pri entries will be first inserted to - // the low-pri list (the midpoint). This is refered to as - // midpoint insertion strategy to make entries never get hit in cache + // the low-pri list (the midpoint). This is referred to as + // midpoint insertion strategy to make entries that never get hit in cache // age out faster. // // See also @@ -87,6 +90,9 @@ struct LRUCacheOptions { CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy; + // A SecondaryCache instance to use a the non-volatile tier + std::shared_ptr secondary_cache; + LRUCacheOptions() {} LRUCacheOptions(size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, double _high_pri_pool_ratio, @@ -126,17 +132,82 @@ extern std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts); // more detail. // // Return nullptr if it is not supported. +// +// BROKEN: ClockCache is known to have bugs that could lead to crash or +// corruption, so should not be used until fixed. Use NewLRUCache instead. extern std::shared_ptr NewClockCache( size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy); + class Cache { public: // Depending on implementation, cache entries with high priority could be less // likely to get evicted than low priority entries. enum class Priority { HIGH, LOW }; + // A set of callbacks to allow objects in the primary block cache to be + // be persisted in a secondary cache. The purpose of the secondary cache + // is to support other ways of caching the object, such as persistent or + // compressed data, that may require the object to be parsed and transformed + // in some way. Since the primary cache holds C++ objects and the secondary + // cache may only hold flat data that doesn't need relocation, these + // callbacks need to be provided by the user of the block + // cache to do the conversion. + // The CacheItemHelper is passed to Insert() and Lookup(). It has pointers + // to callback functions for size, saving and deletion of the + // object. The callbacks are defined in C-style in order to make them + // stateless and not add to the cache metadata size. + // Saving multiple std::function objects will take up 32 bytes per + // function, even if its not bound to an object and does no capture. + // + // All the callbacks are C-style function pointers in order to simplify + // lifecycle management. Objects in the cache can outlive the parent DB, + // so anything required for these operations should be contained in the + // object itself. 
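As a small, hypothetical sketch of how the `secondary_cache` field added to `LRUCacheOptions` above might be wired up (the `SecondaryCache` implementation itself is assumed to be user-supplied; `MakeCache` and the capacity value are invented for the example):

```
#include <memory>

#include "rocksdb/cache.h"

std::shared_ptr<ROCKSDB_NAMESPACE::Cache> MakeCache(
    std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary) {
  ROCKSDB_NAMESPACE::LRUCacheOptions opts;
  opts.capacity = 64 << 20;  // 64 MiB volatile (primary) tier
  // Entries evicted from the LRU tier may be spilled to this tier, provided
  // they were inserted with a CacheItemHelper (see below).
  opts.secondary_cache = secondary;
  return ROCKSDB_NAMESPACE::NewLRUCache(opts);
}
```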
+ // + // The SizeCallback takes a void* pointer to the object and returns the size + // of the persistable data. It can be used by the secondary cache to allocate + // memory if needed. + using SizeCallback = size_t (*)(void* obj); + + // The SaveToCallback takes a void* object pointer and saves the persistable + // data into a buffer. The secondary cache may decide to not store it in a + // contiguous buffer, in which case this callback will be called multiple + // times with increasing offset + using SaveToCallback = Status (*)(void* from_obj, size_t from_offset, + size_t length, void* out); + + // A function pointer type for custom destruction of an entry's + // value. The Cache is responsible for copying and reclaiming space + // for the key, but values are managed by the caller. + using DeleterFn = void (*)(const Slice& key, void* value); + + // A struct with pointers to helper functions for spilling items from the + // cache into the secondary cache. May be extended in the future. An + // instance of this struct is expected to outlive the cache. + struct CacheItemHelper { + SizeCallback size_cb; + SaveToCallback saveto_cb; + DeleterFn del_cb; + + CacheItemHelper() : size_cb(nullptr), saveto_cb(nullptr), del_cb(nullptr) {} + CacheItemHelper(SizeCallback _size_cb, SaveToCallback _saveto_cb, + DeleterFn _del_cb) + : size_cb(_size_cb), saveto_cb(_saveto_cb), del_cb(_del_cb) {} + }; + + // The CreateCallback is passed by the block cache user to Lookup(). It + // takes in a buffer from the NVM cache and constructs an object using + // it. The callback doesn't have ownership of the buffer and should + // copy the contents into its own buffer. + // typedef std::function + // CreateCallback; + using CreateCallback = std::function; + Cache(std::shared_ptr allocator = nullptr) : memory_allocator_(std::move(allocator)) {} // No copying allowed @@ -151,7 +222,7 @@ class Cache { // - Name-value option pairs -- "capacity=1M; num_shard_bits=4; // For the LRUCache, the values are defined in LRUCacheOptions. // @param result The new Cache object - // @return OK if the cache was sucessfully created + // @return OK if the cache was successfully created // @return NotFound if an invalid name was specified in the value // @return InvalidArgument if either the options were not valid static Status CreateFromString(const ConfigOptions& config_options, @@ -170,8 +241,8 @@ class Cache { // The type of the Cache virtual const char* Name() const = 0; - // Insert a mapping from key->value into the cache and assign it - // the specified charge against the total cache capacity. + // Insert a mapping from key->value into the volatile cache only + // and assign it // the specified charge against the total cache capacity. // If strict_capacity_limit is true and cache reaches its full capacity, // return Status::Incomplete. // @@ -184,10 +255,11 @@ class Cache { // insert. In case of error value will be cleanup. // // When the inserted entry is no longer needed, the key and - // value will be passed to "deleter". + // value will be passed to "deleter" which must delete the value. + // (The Cache is responsible for copying and reclaiming space for + // the key.) virtual Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle = nullptr, + DeleterFn deleter, Handle** handle = nullptr, Priority priority = Priority::LOW) = 0; // If the cache has no mapping for "key", returns nullptr. 
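The callback plumbing described above is easier to see with a concrete example. The sketch below is purely illustrative: `MyBlock` and the function names are invented, and the object is assumed to be a flat buffer that can be persisted byte-for-byte.

```
#include <cstring>
#include <string>

#include "rocksdb/cache.h"

using ROCKSDB_NAMESPACE::Cache;
using ROCKSDB_NAMESPACE::Slice;
using ROCKSDB_NAMESPACE::Status;

// Hypothetical cached object: a flat, self-contained buffer.
struct MyBlock {
  std::string data;
};

// SizeCallback: how many bytes the secondary cache needs to allocate.
size_t MyBlockSize(void* obj) {
  return static_cast<MyBlock*>(obj)->data.size();
}

// SaveToCallback: copy a slice of the persistable data into `out`; the
// secondary cache may call this repeatedly with increasing offsets.
Status MyBlockSaveTo(void* from_obj, size_t from_offset, size_t length,
                     void* out) {
  const auto* block = static_cast<MyBlock*>(from_obj);
  std::memcpy(out, block->data.data() + from_offset, length);
  return Status::OK();
}

// DeleterFn: reclaim the value once the entry is finally dropped.
void MyBlockDelete(const Slice& /*key*/, void* value) {
  delete static_cast<MyBlock*>(value);
}

// Stateless helper; it must outlive any cache that sees it.
const Cache::CacheItemHelper kMyBlockHelper(MyBlockSize, MyBlockSaveTo,
                                            MyBlockDelete);
```

An entry would then be inserted with something like `cache->Insert(key, block, &kMyBlockHelper, block->data.size())`, which also makes it eligible for demotion to a configured secondary cache.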
@@ -264,6 +336,12 @@ class Cache { // returns the charge for the specific entry in the cache. virtual size_t GetCharge(Handle* handle) const = 0; + // Returns the deleter for the specified entry. This might seem useless + // as the Cache itself is responsible for calling the deleter, but + // the deleter can essentially verify that a cache entry is of an + // expected type from an expected code source. + virtual DeleterFn GetDeleter(Handle* handle) const = 0; + // Call this on shutdown if you want to speed it up. Cache will disown // any underlying data and will not free it on delete. This call will leak // memory - call this only if you're shutting down the process. @@ -273,11 +351,33 @@ class Cache { // default implementation is noop } - // Apply callback to all entries in the cache - // If thread_safe is true, it will also lock the accesses. Otherwise, it will - // access the cache without the lock held - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) = 0; + struct ApplyToAllEntriesOptions { + // If the Cache uses locks, setting `average_entries_per_lock` to + // a higher value suggests iterating over more entries each time a lock + // is acquired, likely reducing the time for ApplyToAllEntries but + // increasing latency for concurrent users of the Cache. Setting + // `average_entries_per_lock` to a smaller value could be helpful if + // callback is relatively expensive, such as using large data structures. + size_t average_entries_per_lock = 256; + }; + + // Apply a callback to all entries in the cache. The Cache must ensure + // thread safety but does not guarantee that a consistent snapshot of all + // entries is iterated over if other threads are operating on the Cache + // also. + virtual void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) = 0; + + // DEPRECATED version of above. (Default implementation uses above.) + virtual void ApplyToAllCacheEntries(void (*callback)(void* value, + size_t charge), + bool /*thread_safe*/) { + ApplyToAllEntries([callback](const Slice&, void* value, size_t charge, + DeleterFn) { callback(value, charge); }, + {}); + } // Remove all entries. // Prerequisite: no entry is referenced. @@ -287,6 +387,108 @@ class Cache { MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } + // EXPERIMENTAL + // The following APIs are experimental and might change in the future. + // The Insert and Lookup APIs below are intended to allow cached objects + // to be demoted/promoted between the primary block cache and a secondary + // cache. The secondary cache could be a non-volatile cache, and will + // likely store the object in a different representation more suitable + // for on disk storage. They rely on a per object CacheItemHelper to do + // the conversions. + // The secondary cache may persist across process and system restarts, + // and may even be moved between hosts. Therefore, the cache key must + // be repeatable across restarts/reboots, and globally unique if + // multiple DBs share the same cache and the set of DBs can change + // over time. + + // Insert a mapping from key->value into the cache and assign it + // the specified charge against the total cache capacity. + // If strict_capacity_limit is true and cache reaches its full capacity, + // return Status::Incomplete. + // + // The helper argument is saved by the cache and will be used when the + // inserted object is evicted or promoted to the secondary cache. 
It, + // therefore, must outlive the cache. + // + // If handle is not nullptr, returns a handle that corresponds to the + // mapping. The caller must call this->Release(handle) when the returned + // mapping is no longer needed. In case of error caller is responsible to + // cleanup the value (i.e. calling "deleter"). + // + // If handle is nullptr, it is as if Release is called immediately after + // insert. In case of error value will be cleanup. + // + // Regardless of whether the item was inserted into the cache, + // it will attempt to insert it into the secondary cache if one is + // configured, and the helper supports it. + // The cache implementation must support a secondary cache, otherwise + // the item is only inserted into the primary cache. It may + // defer the insertion to the secondary cache as it sees fit. + // + // When the inserted entry is no longer needed, the key and + // value will be passed to "deleter". + virtual Status Insert(const Slice& key, void* value, + const CacheItemHelper* helper, size_t charge, + Handle** handle = nullptr, + Priority priority = Priority::LOW) { + if (!helper) { + return Status::InvalidArgument(); + } + return Insert(key, value, charge, helper->del_cb, handle, priority); + } + + // Lookup the key in the primary and secondary caches (if one is configured). + // The create_cb callback function object will be used to contruct the + // cached object. + // If none of the caches have the mapping for the key, returns nullptr. + // Else, returns a handle that corresponds to the mapping. + // + // This call may promote the object from the secondary cache (if one is + // configured, and has the given key) to the primary cache. + // + // The helper argument should be provided if the caller wants the lookup + // to include the secondary cache (if one is configured) and the object, + // if it exists, to be promoted to the primary cache. The helper may be + // saved and used later when the object is evicted. Therefore, it must + // outlive the cache. + // + // The handle returned may not be ready. The caller should call IsReady() + // to check if the item value is ready, and call Wait() or WaitAll() if + // its not ready. The caller should then call Value() to check if the + // item was successfully retrieved. If unsuccessful (perhaps due to an + // IO error), Value() will return nullptr. + virtual Handle* Lookup(const Slice& key, const CacheItemHelper* /*helper_cb*/, + const CreateCallback& /*create_cb*/, + Priority /*priority*/, bool /*wait*/, + Statistics* stats = nullptr) { + return Lookup(key, stats); + } + + // Release a mapping returned by a previous Lookup(). The "useful" + // parameter specifies whether the data was actually used or not, + // which may be used by the cache implementation to decide whether + // to consider it as a hit for retention purposes. + virtual bool Release(Handle* handle, bool /*useful*/, bool force_erase) { + return Release(handle, force_erase); + } + + // Determines if the handle returned by Lookup() has a valid value yet. The + // call is not thread safe and should be called only by someone holding a + // reference to the handle. + virtual bool IsReady(Handle* /*handle*/) { return true; } + + // If the handle returned by Lookup() is not ready yet, wait till it + // becomes ready. + // Note: A ready handle doesn't necessarily mean it has a valid value. The + // user should call Value() and check for nullptr. + virtual void Wait(Handle* /*handle*/) {} + + // Wait for a vector of handles to become ready. 
As with Wait(), the user + // should check the Value() of each handle for nullptr. This call is not + // thread safe and should only be called by the caller holding a reference + // to each of the handles. + virtual void WaitAll(std::vector& /*handles*/) {} + private: std::shared_ptr memory_allocator_; }; diff --git a/include/rocksdb/cache_bench_tool.h b/include/rocksdb/cache_bench_tool.h new file mode 100644 index 00000000000..413ce159378 --- /dev/null +++ b/include/rocksdb/cache_bench_tool.h @@ -0,0 +1,14 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/status.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { + +int cache_bench_tool(int argc, char** argv); +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/cloud/cloud_log_controller.h b/include/rocksdb/cloud/cloud_log_controller.h index de151e5d4e7..701c89981b8 100644 --- a/include/rocksdb/cloud/cloud_log_controller.h +++ b/include/rocksdb/cloud/cloud_log_controller.h @@ -34,6 +34,7 @@ class CloudLogWritableFile : public WritableFile { virtual Status status() { return status_; } // Appends data to the file. If the file doesn't exist, it'll get created. + using WritableFile::Append; virtual Status Append(const Slice& data) = 0; // Closes a file by writing an EOF marker to the Cloud stream. diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index ed17889318a..14515976ec5 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -14,23 +14,15 @@ #include #include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { class Slice; class SliceTransform; -// Context information of a compaction run -struct CompactionFilterContext { - // Does this compaction run include all data files - bool is_full_compaction; - // Is this compaction requested by the client (true), - // or is it occurring as an automatic compaction process - bool is_manual_compaction; -}; - -// CompactionFilter allows an application to modify/delete a key-value at -// the time of compaction. +// CompactionFilter allows an application to modify/delete a key-value during +// table file creation. class CompactionFilter { public: @@ -47,35 +39,38 @@ class CompactionFilter { kRemoveAndSkipUntil, kChangeBlobIndex, // used internally by BlobDB. kIOError, // used internally by BlobDB. + kUndetermined, }; enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError }; - // Context information of a compaction run + // Context information for a table file creation. struct Context { - // Does this compaction run include all data files + // Whether this table file is created as part of a compaction including all + // table files. bool is_full_compaction; - // Is this compaction requested by the client (true), - // or is it occurring as an automatic compaction process + // Whether this table file is created as part of a compaction requested by + // the client. bool is_manual_compaction; - // Which column family this compaction is for. + // The column family that will contain the created table file. uint32_t column_family_id; + // Reason this table file is being created. 
+ TableFileCreationReason reason; }; virtual ~CompactionFilter() {} - // The compaction process invokes this - // method for kv that is being compacted. A return value - // of false indicates that the kv should be preserved in the - // output of this compaction run and a return value of true - // indicates that this key-value should be removed from the - // output of the compaction. The application can inspect - // the existing value of the key and make decision based on it. + // The table file creation process invokes this method before adding a kv to + // the table file. A return value of false indicates that the kv should be + // preserved in the new table file and a return value of true indicates + // that this key-value should be removed from the new table file. The + // application can inspect the existing value of the key and make decision + // based on it. // - // Key-Values that are results of merge operation during compaction are not - // passed into this function. Currently, when you have a mix of Put()s and - // Merge()s on a same key, we only guarantee to process the merge operands - // through the compaction filters. Put()s might be processed, or might not. + // Key-Values that are results of merge operation during table file creation + // are not passed into this function. Currently, when you have a mix of Put()s + // and Merge()s on a same key, we only guarantee to process the merge operands + // through the `CompactionFilter`s. Put()s might be processed, or might not. // // When the value is to be preserved, the application has the option // to modify the existing_value and pass it back through new_value. @@ -83,9 +78,10 @@ class CompactionFilter { // // Note that RocksDB snapshots (i.e. call GetSnapshot() API on a // DB* object) will not guarantee to preserve the state of the DB with - // CompactionFilter. Data seen from a snapshot might disppear after a - // compaction finishes. If you use snapshots, think twice about whether you - // want to use compaction filter and whether you are using it in a safe way. + // CompactionFilter. Data seen from a snapshot might disappear after a + // table file created with a `CompactionFilter` is installed. If you use + // snapshots, think twice about whether you want to use `CompactionFilter` and + // whether you are using it in a safe way. // // If multithreaded compaction is being used *and* a single CompactionFilter // instance was supplied via Options::compaction_filter, this method may be @@ -93,7 +89,7 @@ class CompactionFilter { // that the call is thread-safe. // // If the CompactionFilter was created by a factory, then it will only ever - // be used by a single thread that is doing the compaction run, and this + // be used by a single thread that is doing the table file creation, and this // call does not need to be thread-safe. However, multiple filters may be // in existence and operating concurrently. virtual bool Filter(int /*level*/, const Slice& /*key*/, @@ -103,9 +99,9 @@ class CompactionFilter { return false; } - // The compaction process invokes this method on every merge operand. If this - // method returns true, the merge operand will be ignored and not written out - // in the compaction output + // The table file creation process invokes this method on every merge operand. + // If this method returns true, the merge operand will be ignored and not + // written out in the new table file. // // Note: If you are using a TransactionDB, it is not recommended to implement // FilterMergeOperand(). 
If a Merge operation is filtered out, TransactionDB @@ -142,14 +138,16 @@ class CompactionFilter { // snapshot - beware if you're using TransactionDB or // DB::GetSnapshot(). // - If value for a key was overwritten or merged into (multiple Put()s - // or Merge()s), and compaction filter skips this key with + // or Merge()s), and `CompactionFilter` skips this key with // kRemoveAndSkipUntil, it's possible that it will remove only // the new value, exposing the old value that was supposed to be // overwritten. // - Doesn't work with PlainTableFactory in prefix mode. - // - If you use kRemoveAndSkipUntil, consider also reducing - // compaction_readahead_size option. + // - If you use kRemoveAndSkipUntil for table files created by + // compaction, consider also reducing compaction_readahead_size + // option. // + // Should never return kUndetermined. // Note: If you are using a TransactionDB, it is not recommended to filter // out or modify merge operands (ValueType::kMergeOperand). // If a merge operation is filtered out, TransactionDB may not realize there @@ -187,27 +185,53 @@ class CompactionFilter { } // This function is deprecated. Snapshots will always be ignored for - // compaction filters, because we realized that not ignoring snapshots doesn't - // provide the gurantee we initially thought it would provide. Repeatable - // reads will not be guaranteed anyway. If you override the function and - // returns false, we will fail the compaction. + // `CompactionFilter`s, because we realized that not ignoring snapshots + // doesn't provide the guarantee we initially thought it would provide. + // Repeatable reads will not be guaranteed anyway. If you override the + // function and returns false, we will fail the table file creation. virtual bool IgnoreSnapshots() const { return true; } - // Returns a name that identifies this compaction filter. + // Returns a name that identifies this `CompactionFilter`. // The name will be printed to LOG file on start up for diagnosis. virtual const char* Name() const = 0; + + // Internal (BlobDB) use only. Do not override in application code. + virtual bool IsStackedBlobDbInternalCompactionFilter() const { return false; } + + // In the case of BlobDB, it may be possible to reach a decision with only + // the key without reading the actual value. Keys whose value_type is + // kBlobIndex will be checked by this method. + // Returning kUndetermined will cause FilterV2() to be called to make a + // decision as usual. + virtual Decision FilterBlobByKey(int /*level*/, const Slice& /*key*/, + std::string* /*new_value*/, + std::string* /*skip_until*/) const { + return Decision::kUndetermined; + } }; -// Each compaction will create a new CompactionFilter allowing the -// application to know about different compactions +// Each thread of work involving creating table files will create a new +// `CompactionFilter` according to `ShouldFilterTableFileCreation()`. This +// allows the application to know about the different ongoing threads of work +// and makes it unnecessary for `CompactionFilter` to provide thread-safety. class CompactionFilterFactory { public: virtual ~CompactionFilterFactory() {} + // Returns whether a thread creating table files for the specified `reason` + // should invoke `CreateCompactionFilter()` and pass KVs through the returned + // filter. 
+ virtual bool ShouldFilterTableFileCreation( + TableFileCreationReason reason) const { + // For backward compatibility, default implementation only applies + // `CompactionFilter` to files generated by compaction. + return reason == TableFileCreationReason::kCompaction; + } + virtual std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) = 0; - // Returns a name that identifies this compaction filter factory. + // Returns a name that identifies this `CompactionFilter` factory. virtual const char* Name() const = 0; }; diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h index 0f9c8fcbb1f..626f3202ff9 100644 --- a/include/rocksdb/compaction_job_stats.h +++ b/include/rocksdb/compaction_job_stats.h @@ -25,25 +25,33 @@ struct CompactionJobStats { // the number of compaction input records. uint64_t num_input_records; - // the number of compaction input files. + // the number of blobs read from blob files + uint64_t num_blobs_read; + // the number of compaction input files (table files) size_t num_input_files; - // the number of compaction input files at the output level. + // the number of compaction input files at the output level (table files) size_t num_input_files_at_output_level; // the number of compaction output records. uint64_t num_output_records; - // the number of compaction output files. + // the number of compaction output files (table files) size_t num_output_files; + // the number of compaction output files (blob files) + size_t num_output_files_blob; // true if the compaction is a full compaction (all live SST files input) bool is_full_compaction; // true if the compaction is a manual compaction bool is_manual_compaction; - // the size of the compaction input in bytes. + // the total size of table files in the compaction input uint64_t total_input_bytes; - // the size of the compaction output in bytes. + // the total size of blobs read from blob files + uint64_t total_blob_bytes_read; + // the total size of table files in the compaction output uint64_t total_output_bytes; + // the total size of blob files in the compaction output + uint64_t total_output_bytes_blob; // number of records being replaced by newer record associated with same key. // this could be a new value or a deletion entry for that key so this field diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 53a46ad3359..37c2925bc33 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -110,7 +110,9 @@ class Comparator { // == 0 iff t1 == t2 // > 0 iff t1 > t2 // Note that an all-zero byte array will be the smallest (oldest) timestamp - // of the same length. + // of the same length, and a byte array with all bits 1 will be the largest. + // In the future, we can extend Comparator so that subclasses can specify + // both largest and smallest timestamps. 
virtual int CompareTimestamp(const Slice& /*ts1*/, const Slice& /*ts2*/) const { return 0; @@ -121,6 +123,11 @@ class Comparator { return Compare(a, b); } + virtual bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const { + return 0 == + CompareWithoutTimestamp(a, /*a_has_ts=*/true, b, /*b_has_ts=*/true); + } + private: size_t timestamp_size_; }; diff --git a/include/rocksdb/concurrent_task_limiter.h b/include/rocksdb/concurrent_task_limiter.h index 4fc6b794063..f8e7ed8ab44 100644 --- a/include/rocksdb/concurrent_task_limiter.h +++ b/include/rocksdb/concurrent_task_limiter.h @@ -33,7 +33,7 @@ class ConcurrentTaskLimiter { virtual int32_t GetOutstandingTask() const = 0; }; -// Create a ConcurrentTaskLimiter that can be shared with mulitple CFs +// Create a ConcurrentTaskLimiter that can be shared with multiple CFs // across RocksDB instances to control concurrent tasks. // // @param name: Name of the limiter. diff --git a/include/rocksdb/configurable.h b/include/rocksdb/configurable.h index f4bfbf53285..b56072dbeae 100644 --- a/include/rocksdb/configurable.h +++ b/include/rocksdb/configurable.h @@ -28,7 +28,7 @@ struct DBOptions; // standard way of configuring objects. A Configurable object can: // -> Populate itself given: // - One or more "name/value" pair strings -// - A string repesenting the set of name=value properties +// - A string representing the set of name=value properties // - A map of name/value properties. // -> Convert itself into its string representation // -> Dump itself to a Logger @@ -166,7 +166,7 @@ class Configurable { // This is the inverse of ConfigureFromString. // @param config_options Controls how serialization happens. // @param result The string representation of this object. - // @return OK If the options for this object wer successfully serialized. + // @return OK If the options for this object were successfully serialized. // @return InvalidArgument If one or more of the options could not be // serialized. Status GetOptionString(const ConfigOptions& config_options, @@ -240,7 +240,7 @@ class Configurable { // @param config_options Controls how the object is prepared. Also contains // a Logger and Env that can be used to initialize this object. // @return OK If the object was successfully initialized. - // @return InvalidArgument If this object could not be successfull + // @return InvalidArgument If this object could not be successfully // initialized. virtual Status PrepareOptions(const ConfigOptions& config_options); @@ -270,18 +270,13 @@ class Configurable { // True once the object is prepared. Once the object is prepared, only // mutable options can be configured. bool prepared_; - // If this class is a wrapper (has-a), this method should be - // over-written to return the inner configurable (like an EnvWrapper). - // This method should NOT recurse, but should instead return the - // direct Inner object. - virtual Configurable* Inner() const { return nullptr; } // Returns the raw pointer for the associated named option. // The name is typically the name of an option registered via the // Classes may override this method to provide further specialization (such as // returning a sub-option) // - // The default implemntation looks at the registered options. If the + // The default implementation looks at the registered options. If the // input name matches that of a registered option, the pointer registered // with that name is returned. 
// e.g,, RegisterOptions("X", &my_ptr, ...); GetOptionsPtr("X") returns @@ -355,6 +350,35 @@ class Configurable { // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt) virtual std::string GetOptionName(const std::string& long_name) const; + // Registers the input name with the options and associated map. + // When classes register their options in this manner, most of the + // functionality (excluding unknown options and validate/prepare) is + // implemented by the base class. + // + // This method should be called in the class constructor to register the + // option set for this object. For example, to register the options + // associated with the BlockBasedTableFactory, the constructor calls this + // method passing in: + // - the name of the options ("BlockBasedTableOptions"); + // - the options object (the BlockBasedTableOptions object for this object; + // - the options type map for the BlockBasedTableOptions. + // This registration allows the Configurable class to process the option + // values associated with the BlockBasedTableOptions without further code in + // the derived class. + // + // @param name The name of this set of options (@see GetOptionsPtr) + // @param opt_ptr Pointer to the options to associate with this name + // @param opt_map Options map that controls how this option is configured. + template + void RegisterOptions( + T* opt_ptr, + const std::unordered_map* opt_map) { + RegisterOptions(T::kName(), opt_ptr, opt_map); + } + void RegisterOptions( + const std::string& name, void* opt_ptr, + const std::unordered_map* opt_map); + private: // Contains the collection of options (name, opt_ptr, opt_map) associated with // this object. This collection is typically set in the constructor of the diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index f861b2fcf3d..dfcd7e22929 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -16,6 +16,9 @@ namespace ROCKSDB_NAMESPACE { class Env; +class Logger; +class ObjectRegistry; + struct ColumnFamilyOptions; struct DBOptions; struct Options; @@ -27,6 +30,15 @@ struct Options; // of the serialization (e.g. delimiter), and how to compare // options (sanity_level). struct ConfigOptions { + // Constructs a new ConfigOptions with a new object registry. + // This method should only be used when a DBOptions is not available, + // else registry settings may be lost + ConfigOptions(); + + // Constructs a new ConfigOptions using the settings from + // the input DBOptions. Currently constructs a new object registry. + explicit ConfigOptions(const DBOptions&); + // This enum defines the RocksDB options sanity level. enum SanityLevel : unsigned char { kSanityLevelNone = 0x01, // Performs no sanity check at all. @@ -56,6 +68,13 @@ struct ConfigOptions { // Whether or not to invoke PrepareOptions after configure is called. bool invoke_prepare_options = true; + // Options can be marked as Mutable (OptionTypeInfo::IsMutable()) or not. + // When "mutable_options_only=false", all options are evaluated. + // When "mutable_options_only="true", any option not marked as Mutable is + // either ignored (in the case of string/equals methods) or results in an + // error (in the case of Configure). 
+ bool mutable_options_only = false; + // The separator between options when converting to a string std::string delimiter = ";"; @@ -71,6 +90,11 @@ struct ConfigOptions { // The environment to use for this option Env* env = Env::Default(); +#ifndef ROCKSDB_LITE + // The object registry to use for this options + std::shared_ptr registry; +#endif + bool IsShallow() const { return depth == Depth::kDepthShallow; } bool IsDetailed() const { return depth == Depth::kDepthDetailed; } @@ -86,7 +110,7 @@ struct ConfigOptions { #ifndef ROCKSDB_LITE // The following set of functions provide a way to construct RocksDB Options -// from a string or a string-to-string map. Here're the general rule of +// from a string or a string-to-string map. Here is the general rule of // setting option values from strings by type. Some RocksDB types are also // supported in these APIs. Please refer to the comment of the function itself // to find more information about how to config those RocksDB types. @@ -142,7 +166,7 @@ struct ConfigOptions { // ColumnFamilyOptions "new_options". // // Below are the instructions of how to config some non-primitive-typed -// options in ColumnFOptions: +// options in ColumnFamilyOptions: // // * table_factory: // table_factory can be configured using our custom nested-option syntax. @@ -184,7 +208,7 @@ struct ConfigOptions { // * {"memtable", "skip_list:5"} is equivalent to setting // memtable to SkipListFactory(5). // - PrefixHash: -// Pass "prfix_hash:" to config memtable +// Pass "prefix_hash:" to config memtable // to use PrefixHash, or simply "prefix_hash" to use the default // PrefixHash. // [Example]: @@ -493,7 +517,6 @@ Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const ReadOptions& read_options, const std::string& file_path); - #endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/customizable.h b/include/rocksdb/customizable.h new file mode 100644 index 00000000000..24ddfa56c0c --- /dev/null +++ b/include/rocksdb/customizable.h @@ -0,0 +1,156 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "rocksdb/configurable.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { +/** + * Customizable a base class used by the rocksdb that describes a + * standard way of configuring and creating objects. Customizable objects + * are configurable objects that can be created from an ObjectRegistry. + * + * Customizable classes are used when there are multiple potential + * implementations of a class for use by RocksDB (e.g. Table, Cache, + * MergeOperator, etc). The abstract base class is expected to define a method + * declaring its type and a factory method for creating one of these, such as: + * static const char *Type() { return "Table"; } + * static Status CreateFromString(const ConfigOptions& options, + * const std::string& id, + * std::shared_ptr* result); + * The "Type" string is expected to be unique (no two base classes are the same + * type). 
This factory is expected, based on the options and id, create and + * return the appropriate derived type of the customizable class (e.g. + * BlockBasedTableFactory, PlainTableFactory, etc). For extension developers, + * helper classes and methods are provided for writing this factory. + * + * Instances of a Customizable class need to define: + * - A "static const char *kClassName()" method. This method defines the name + * of the class instance (e.g. BlockBasedTable, LRUCache) and is used by the + * CheckedCast method. + * - The Name() of the object. This name is used when creating and saving + * instances of this class. Typically this name will be the same as + * kClassName(). + * + * Additionally, Customizable classes should register any options used to + * configure themselves with the Configurable subsystem. + * + * When a Customizable is being created, the "name" property specifies + * the name of the instance being created. + * For custom objects, their configuration and name can be specified by: + * [prop]={name=X;option 1 = value1[; option2=value2...]} + * + * [prop].name=X + * [prop].option1 = value1 + * + * [prop].name=X + * X.option1 =value1 + */ +class Customizable : public Configurable { + public: + virtual ~Customizable() {} + + // Returns the name of this class of Customizable + virtual const char* Name() const = 0; + + // Returns an identifier for this Customizable. + // This could be its name or something more complex (like its URL/pattern). + // Used for pretty printing. + virtual std::string GetId() const { + std::string id = Name(); + return id; + } + + // This is typically determined by if the input name matches the + // name of this object. + // This method is typically used in conjunction with CheckedCast to find the + // derived class instance from its base. For example, if you have an Env + // and want the "Default" env, you would IsInstanceOf("Default") to get + // the default implementation. This method should be used when you need a + // specific derivative or implementation of a class. + // + // Intermediary caches (such as SharedCache) may wish to override this method + // to check for the intermediary name (SharedCache). Classes with multiple + // potential names (e.g. "PosixEnv", "DefaultEnv") may also wish to override + // this method. + // + // Note that IsInstanceOf only uses the "is-a" relationship and not "has-a". + // Wrapped classes that have an Inner "has-a" should not be returned. + // + // @param name The name of the instance to find. + // Returns true if the class is an instance of the input name. + virtual bool IsInstanceOf(const std::string& name) const { + return name == Name(); + } + + // Returns the named instance of the Customizable as a T*, or nullptr if not + // found. This method uses IsInstanceOf/Inner to find the appropriate class + // instance and then casts it to the expected return type. + template + const T* CheckedCast() const { + if (IsInstanceOf(T::kClassName())) { + return static_cast(this); + } else { + const auto inner = Inner(); + if (inner != nullptr) { + return inner->CheckedCast(); + } else { + return nullptr; + } + } + } + + template + T* CheckedCast() { + if (IsInstanceOf(T::kClassName())) { + return static_cast(this); + } else { + auto inner = const_cast(Inner()); + if (inner != nullptr) { + return inner->CheckedCast(); + } else { + return nullptr; + } + } + } + + // Checks to see if this Customizable is equivalent to other. + // This method assumes that the two objects are of the same class. 
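As a usage sketch of the IsInstanceOf()/CheckedCast() pattern defined above (the class names here are invented for illustration and are not part of this patch):

  class MyPlugin : public Customizable {
   public:
    static const char* kClassName() { return "MyPlugin"; }
    const char* Name() const override { return kClassName(); }
  };

  // Given only a Customizable*, recover the derived type if it matches:
  void MaybeUse(const Customizable* c) {
    // CheckedCast consults IsInstanceOf() and, failing that, Inner().
    const MyPlugin* p = c->CheckedCast<MyPlugin>();
    if (p != nullptr) {
      // p may safely be used as a MyPlugin here.
    }
  }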
+ // @param config_options Controls how the options are compared. + // @param other The other object to compare to. + // @param mismatch If the objects do not match, this parameter contains + // the name of the option that triggered the match failure. + // @param True if the objects match, false otherwise. + // @see Configurable::AreEquivalent for more details + bool AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* mismatch) const override; +#ifndef ROCKSDB_LITE + // Gets the value of the option associated with the input name + // @see Configurable::GetOption for more details + Status GetOption(const ConfigOptions& config_options, const std::string& name, + std::string* value) const override; +#endif // ROCKSDB_LITE + + // Returns the inner class when a Customizable implements a has-a (wrapped) + // relationship. Derived classes that implement a has-a must override this + // method in order to get CheckedCast to function properly. + virtual const Customizable* Inner() const { return nullptr; } + + protected: + // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt) + std::string GetOptionName(const std::string& long_name) const override; +#ifndef ROCKSDB_LITE + std::string SerializeOptions(const ConfigOptions& options, + const std::string& prefix) const override; +#endif // ROCKSDB_LITE +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/data_structure.h b/include/rocksdb/data_structure.h new file mode 100644 index 00000000000..c9a4ebd829b --- /dev/null +++ b/include/rocksdb/data_structure.h @@ -0,0 +1,48 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// This is a data structure specifically designed as a "Set" for a +// pretty small scale of Enum structure. For now, it can support up +// to 64 element, and it is expandable in the future. +template +class SmallEnumSet { + public: + SmallEnumSet() : state_(0) {} + + ~SmallEnumSet() {} + + // Return true if the input enum is included in the "Set" (i.e., changes the + // internal scalar state successfully), otherwise, it will return false. + bool Add(const ENUM_TYPE value) { + static_assert(MAX_VALUE <= 63, "Size currently limited to 64"); + assert(value >= 0 && value <= MAX_VALUE); + uint64_t old_state = state_; + uint64_t tmp = 1; + state_ |= (tmp << value); + return old_state != state_; + } + + // Return true if the input enum is contained in the "Set". + bool Contains(const ENUM_TYPE value) { + static_assert(MAX_VALUE <= 63, "Size currently limited to 64"); + assert(value >= 0 && value <= MAX_VALUE); + uint64_t tmp = 1; + return state_ & (tmp << value); + } + + private: + uint64_t state_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index e31042170f8..88bd00004b1 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -115,7 +115,7 @@ struct RangePtr { }; // It is valid that files_checksums and files_checksum_func_names are both -// empty (no checksum informaiton is provided for ingestion). Otherwise, +// empty (no checksum information is provided for ingestion). Otherwise, // their sizes should be the same as external_files. 
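For instance, a hedged sketch of filling in this struct with optional per-file checksum information; the paths and checksum strings below are placeholders and `db` is assumed to be an already-open instance:

  IngestExternalFileArg arg;
  arg.column_family = db->DefaultColumnFamily();
  arg.external_files = {"/tmp/file1.sst", "/tmp/file2.sst"};
  // Either leave both checksum vectors empty, or supply one entry per file,
  // in the same order as external_files:
  arg.files_checksums = {"<checksum-of-file1>", "<checksum-of-file2>"};
  arg.files_checksum_func_names = {"crc32c", "crc32c"};
  Status s = db->IngestExternalFiles({arg});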
The file order should // be the same in three vectors and guaranteed by the caller. struct IngestExternalFileArg { @@ -143,11 +143,15 @@ typedef std::unordered_map> // and a number of wrapper implementations. class DB { public: - // Open the database with the specified "name". + // Open the database with the specified "name" for reads and writes. // Stores a pointer to a heap-allocated database in *dbptr and returns // OK on success. - // Stores nullptr in *dbptr and returns a non-OK status on error. - // Caller should delete *dbptr when it is no longer needed. + // Stores nullptr in *dbptr and returns a non-OK status on error, including + // if the DB is already open (read-write) by another DB object. (This + // guarantee depends on options.env->LockFile(), which might not provide + // this guarantee in a custom Env implementation.) + // + // Caller must delete *dbptr when it is no longer needed. static Status Open(const Options& options, const std::string& name, DB** dbptr); @@ -156,6 +160,12 @@ class DB { // If the db is opened in read only mode, then no compactions // will happen. // + // While a given DB can be simultaneously open via OpenForReadOnly + // by any number of readers, if a DB is simultaneously open by Open + // and OpenForReadOnly, the read-only instance has undefined behavior + // (though can often succeed if quickly closed) and the read-write + // instance is unaffected. See also OpenAsSecondary. + // // Not supported in ROCKSDB_LITE, in which case the function will // return Status::NotSupported. static Status OpenForReadOnly(const Options& options, const std::string& name, @@ -168,6 +178,12 @@ class DB { // column family. The default column family name is 'default' and it's stored // in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName // + // While a given DB can be simultaneously open via OpenForReadOnly + // by any number of readers, if a DB is simultaneously open by Open + // and OpenForReadOnly, the read-only instance has undefined behavior + // (though can often succeed if quickly closed) and the read-write + // instance is unaffected. See also OpenAsSecondary. + // // Not supported in ROCKSDB_LITE, in which case the function will // return Status::NotSupported. static Status OpenForReadOnly( @@ -208,11 +224,11 @@ class DB { // to open the primary instance. // The secondary_path argument points to a directory where the secondary // instance stores its info log. - // The column_families argument specifieds a list of column families to open. + // The column_families argument specifies a list of column families to open. // If any of the column families does not exist, the function returns non-OK // status. // The handles is an out-arg corresponding to the opened database column - // familiy handles. + // family handles. // The dbptr is an out-arg corresponding to the opened secondary instance. // The pointer points to a heap-allocated database, and the caller should // delete it after use. Before deleting the dbptr, the user should also @@ -242,6 +258,16 @@ class DB { const std::vector& column_families, std::vector* handles, DB** dbptr); + // Open DB and run the compaction. + // It's a read-only operation, the result won't be installed to the DB, it + // will be output to the `output_directory`. The API should only be used with + // `options.CompactionService` to run compaction triggered by + // `CompactionService`. 
+ static Status OpenAndCompact( + const std::string& name, const std::string& output_directory, + const std::string& input, std::string* output, + const CompactionServiceOptionsOverride& override_options); + virtual Status Resume() { return Status::NotSupported(); } // Close the DB by releasing resources, closing files etc. This should be @@ -716,7 +742,9 @@ class DB { virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; #ifndef ROCKSDB_LITE - // Contains all valid property arguments for GetProperty(). + // Contains all valid property arguments for GetProperty() or + // GetMapProperty(). Each is a "string" property for retrieval with + // GetProperty() unless noted as a "map" property, for GetMapProperty(). // // NOTE: Property names cannot end in numbers since those are interpreted as // arguments, e.g., see kNumFilesAtLevelPrefix. @@ -741,19 +769,14 @@ class DB { // SST files. static const std::string kSSTables; - // "rocksdb.cfstats" - Both of "rocksdb.cfstats-no-file-histogram" and - // "rocksdb.cf-file-histogram" together. See below for description - // of the two. + // "rocksdb.cfstats" - Raw data from "rocksdb.cfstats-no-file-histogram" + // and "rocksdb.cf-file-histogram" as a "map" property. static const std::string kCFStats; // "rocksdb.cfstats-no-file-histogram" - returns a multi-line string with - // general columm family stats per-level over db's lifetime ("L"), + // general column family stats per-level over db's lifetime ("L"), // aggregated over db's lifetime ("Sum"), and aggregated over the // interval since the last retrieval ("Int"). - // It could also be used to return the stats in the format of the map. - // In this case there will a pair of string to array of double for - // each level as well as for "Sum". "Int" stats will not be affected - // when this form of stats are retrieved. static const std::string kCFStatsNoFileHistogram; // "rocksdb.cf-file-histogram" - print out how many file reads to every @@ -769,6 +792,10 @@ class DB { // of files per level and total size of each level (MB). static const std::string kLevelStats; + // "rocksdb.block-cache-entry-stats" - returns a multi-line string or + // map with statistics on block cache usage. + static const std::string kBlockCacheEntryStats; + // "rocksdb.num-immutable-mem-table" - returns number of immutable // memtables that have not yet been flushed. static const std::string kNumImmutableMemTable; @@ -894,8 +921,10 @@ class DB { // based. static const std::string kEstimatePendingCompactionBytes; - // "rocksdb.aggregated-table-properties" - returns a string representation - // of the aggregated table properties of the target column family. + // "rocksdb.aggregated-table-properties" - returns a string or map + // representation of the aggregated table properties of the target + // column family. Only properties that make sense for aggregation + // are included. static const std::string kAggregatedTableProperties; // "rocksdb.aggregated-table-properties-at-level", same as the previous @@ -933,15 +962,19 @@ class DB { }; #endif /* ROCKSDB_LITE */ - // DB implementations can export properties about their state via this method. - // If "property" is a valid property understood by this DB implementation (see - // Properties struct above for valid options), fills "*value" with its current - // value and returns true. Otherwise, returns false. + // DB implementations export properties about their state via this method. 
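A brief sketch of retrieving one "string" property and one "map" property named in the Properties struct above, assuming an open `db` (error handling omitted):

  std::string num_imm;
  if (db->GetProperty(DB::Properties::kNumImmutableMemTable, &num_imm)) {
    // num_imm holds a decimal count, e.g. "0"
  }

  std::map<std::string, std::string> cf_stats;
  if (db->GetMapProperty(db->DefaultColumnFamily(), DB::Properties::kCFStats,
                         &cf_stats)) {
    // per-level column family statistics, keyed by stat name
  }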
+ // If "property" is a valid "string" property understood by this DB + // implementation (see Properties struct above for valid options), fills + // "*value" with its current value and returns true. Otherwise, returns + // false. virtual bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, std::string* value) = 0; virtual bool GetProperty(const Slice& property, std::string* value) { return GetProperty(DefaultColumnFamily(), property, value); } + + // Like GetProperty but for valid "map" properties. (Some properties can be + // accessed as either "string" properties or "map" properties.) virtual bool GetMapProperty(ColumnFamilyHandle* column_family, const Slice& property, std::map* value) = 0; @@ -1025,21 +1058,24 @@ class DB { uint64_t* sizes) = 0; // Simpler versions of the GetApproximateSizes() method above. - // The include_flags argumenbt must of type DB::SizeApproximationFlags + // The include_flags argument must of type DB::SizeApproximationFlags // and can not be NONE. - virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, - const Range* ranges, int n, uint64_t* sizes, - uint8_t include_flags = INCLUDE_FILES) { + virtual Status GetApproximateSizes(ColumnFamilyHandle* column_family, + const Range* ranges, int n, + uint64_t* sizes, + uint8_t include_flags = INCLUDE_FILES) { SizeApproximationOptions options; options.include_memtabtles = (include_flags & SizeApproximationFlags::INCLUDE_MEMTABLES) != 0; options.include_files = (include_flags & SizeApproximationFlags::INCLUDE_FILES) != 0; - GetApproximateSizes(options, column_family, ranges, n, sizes); + return GetApproximateSizes(options, column_family, ranges, n, sizes); } - virtual void GetApproximateSizes(const Range* ranges, int n, uint64_t* sizes, - uint8_t include_flags = INCLUDE_FILES) { - GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes, include_flags); + virtual Status GetApproximateSizes(const Range* ranges, int n, + uint64_t* sizes, + uint8_t include_flags = INCLUDE_FILES) { + return GetApproximateSizes(DefaultColumnFamily(), ranges, n, sizes, + include_flags); } // The method is similar to GetApproximateSizes, except it @@ -1079,6 +1115,8 @@ class DB { // and the data is rearranged to reduce the cost of operations // needed to access the data. This operation should typically only // be invoked by users who understand the underlying implementation. + // This call blocks until the operation completes successfully, fails, + // or is aborted (Status::Incomplete). See DisableManualCompaction. // // begin==nullptr is treated as a key before all keys in the database. // end==nullptr is treated as a key after all keys in the database. @@ -1133,9 +1171,9 @@ class DB { const std::unordered_map& new_options) = 0; // CompactFiles() inputs a list of files specified by file numbers and - // compacts them to the specified level. Note that the behavior is different - // from CompactRange() in that CompactFiles() performs the compaction job - // using the CURRENT thread. + // compacts them to the specified level. A small difference compared to + // CompactRange() is that CompactFiles() performs the compaction job + // using the CURRENT thread, so is not considered a "background" job. // // @see GetDataBaseMetaData // @see GetColumnFamilyMetaData @@ -1177,7 +1215,16 @@ class DB { virtual Status EnableAutoCompaction( const std::vector& column_family_handles) = 0; + // After this function call, CompactRange() or CompactFiles() will not + // run compactions and fail. 
Calling this function will tell outstanding + // manual compactions to abort and will wait for them to finish or abort + // before returning. virtual void DisableManualCompaction() = 0; + // Re-enable CompactRange() and ComapctFiles() that are disabled by + // DisableManualCompaction(). This function must be called as many times + // as DisableManualCompaction() has been called in order to re-enable + // manual compactions, and must not be called more times than + // DisableManualCompaction() has been called. virtual void EnableManualCompaction() = 0; // Number of levels used for this DB. @@ -1365,7 +1412,7 @@ class DB { virtual void GetLiveFilesMetaData( std::vector* /*metadata*/) {} - // Return a list of all table file checksum info. + // Return a list of all table and blob files checksum info. // Note: This function might be of limited use because it cannot be // synchronized with GetLiveFiles. virtual Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) = 0; @@ -1446,6 +1493,12 @@ class DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) = 0; + // Verify the checksums of files in db. Currently the whole-file checksum of + // table files are checked. + virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) { + return Status::NotSupported("File verification not supported"); + } + // Verify the block checksums of files in db. The block checksums of table // files are checked. virtual Status VerifyChecksum(const ReadOptions& read_options) = 0; @@ -1602,14 +1655,14 @@ class DB { return Status::NotSupported("EndTrace() is not implemented."); } - // StartIOTrace and EndIOTrace are experimental. They are not enabled yet. - virtual Status StartIOTrace(Env* /*env*/, const TraceOptions& /*options*/, + // IO Tracing operations. Use EndIOTrace() to stop tracing. + virtual Status StartIOTrace(const TraceOptions& /*options*/, std::unique_ptr&& /*trace_writer*/) { - return Status::NotSupported("StartTrace() is not implemented."); + return Status::NotSupported("StartIOTrace() is not implemented."); } virtual Status EndIOTrace() { - return Status::NotSupported("StartTrace() is not implemented."); + return Status::NotSupported("EndIOTrace() is not implemented."); } // Trace block cache accesses. Use EndBlockCacheTrace() to stop tracing. 
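A short sketch of the DisableManualCompaction()/EnableManualCompaction() nesting contract described above, assuming an open `db`; the calls must be balanced:

  db->DisableManualCompaction();  // waits out / aborts in-flight manual compactions
  db->DisableManualCompaction();  // nesting is allowed

  // A CompactRange()/CompactFiles() issued here will not run; it fails
  // (typically reported as an aborted/incomplete status) instead of compacting.

  db->EnableManualCompaction();   // still disabled: one Disable() remains outstanding
  db->EnableManualCompaction();   // balanced again; manual compactions may run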
diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 955d591c310..a4463060a90 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -17,12 +17,15 @@ #pragma once #include + #include #include #include #include #include #include + +#include "rocksdb/functor_wrapper.h" #include "rocksdb/status.h" #include "rocksdb/thread_status.h" @@ -30,11 +33,12 @@ // Windows API macro interference #undef DeleteFile #undef GetCurrentTime +#undef LoadLibrary #endif #if defined(__GNUC__) || defined(__clang__) #define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \ - __attribute__((__format__(__printf__, format_param, dots_param))) + __attribute__((__format__(__printf__, format_param, dots_param))) #else #define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) #endif @@ -47,6 +51,7 @@ class Logger; class RandomAccessFile; class SequentialFile; class Slice; +struct DataVerificationInfo; class WritableFile; class RandomRWFile; class MemoryMappedFileBuffer; @@ -58,6 +63,8 @@ class RateLimiter; class ThreadStatusUpdater; struct ThreadStatus; class FileSystem; +class SystemClock; +struct ConfigOptions; const size_t kDefaultPageSize = 4 * 1024; @@ -149,8 +156,11 @@ class Env { }; Env(); - // Construct an Env with a separate FileSystem implementation - Env(std::shared_ptr fs); + // Construct an Env with a separate FileSystem and/or SystemClock + // implementation + explicit Env(const std::shared_ptr& fs); + Env(const std::shared_ptr& fs, + const std::shared_ptr& clock); // No copying allowed Env(const Env&) = delete; void operator=(const Env&) = delete; @@ -160,12 +170,44 @@ class Env { static const char* Type() { return "Environment"; } // Loads the environment specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. static Status LoadEnv(const std::string& value, Env** result); // Loads the environment specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. static Status LoadEnv(const std::string& value, Env** result, std::shared_ptr* guard); + // Loads the environment specified by the input value into the result + // @see Customizable for a more detailed description of the parameters and + // return codes + // + // @param config_options Controls how the environment is loaded. + // @param value the name and associated properties for the environment. + // @param result On success, the environment that was loaded. + // @param guard If specified and the loaded environment is not static, + // this value will contain the loaded environment (guard.get() == + // result). + // @return OK If the environment was successfully loaded (and optionally + // prepared) + // @return not-OK if the load failed. + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result); + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, Env** result, + std::shared_ptr* guard); + + // Loads the environment specified by the env and fs uri. + // If both are specified, an error is returned. + // Otherwise, the environment is created by loading (via CreateFromString) + // the appropriate env/fs from the corresponding values. 
+ static Status CreateFromUri(const ConfigOptions& options, + const std::string& env_uri, + const std::string& fs_uri, Env** result, + std::shared_ptr* guard); + // Return a default environment suitable for the current operating // system. Sophisticated users may wish to provide their own Env // implementation instead of relying on this default environment. @@ -282,7 +324,8 @@ class Env { virtual Status FileExists(const std::string& fname) = 0; // Store in *result the names of the children of the specified directory. - // The names are relative to "dir". + // The names are relative to "dir", and shall never include the + // names `.` or `..`. // Original contents of *results are dropped. // Returns OK if "dir" exists and "*result" contains its children. // NotFound if "dir" does not exist, the calling process does not have @@ -295,7 +338,8 @@ class Env { // In case the implementation lists the directory prior to iterating the files // and files are concurrently deleted, the deleted files will be omitted from // result. - // The name attributes are relative to "dir". + // The name attributes are relative to "dir", and shall never include the + // names `.` or `..`. // Original contents of *results are dropped. // Returns OK if "dir" exists and "*result" contains its children. // NotFound if "dir" does not exist, the calling process does not have @@ -414,6 +458,21 @@ class Env { // When "function(arg)" returns, the thread will be destroyed. virtual void StartThread(void (*function)(void* arg), void* arg) = 0; + // Start a new thread, invoking "function(args...)" within the new thread. + // When "function(args...)" returns, the thread will be destroyed. + template + void StartThreadTyped(FunctionT function, Args&&... args) { + using FWType = FunctorWrapper; + StartThread( + [](void* arg) { + auto* functor = static_cast(arg); + functor->invoke(); + delete functor; + }, + new FWType(std::function(function), + std::forward(args)...)); + } + // Wait for all threads started by StartThread to terminate. virtual void WaitForJoin() {} @@ -429,7 +488,7 @@ class Env { virtual Status GetTestDirectory(std::string* path) = 0; // Create and returns a default logger (an instance of EnvLogger) for storing - // informational messages. Derived classes can overide to provide custom + // informational messages. Derived classes can override to provide custom // logger. virtual Status NewLogger(const std::string& fname, std::shared_ptr* result); @@ -452,9 +511,15 @@ class Env { // Sleep/delay the thread for the prescribed number of micro-seconds. virtual void SleepForMicroseconds(int micros) = 0; - // Get the current host name. + // Get the current host name as a null terminated string iff the string + // length is < len. The hostname should otherwise be truncated to len. virtual Status GetHostName(char* name, uint64_t len) = 0; + // Get the current hostname from the given env as a std::string in result. + // The result may be truncated if the hostname is too + // long + virtual Status GetHostNameString(std::string* result); + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). // Only overwrites *unix_time on success. virtual Status GetCurrentTime(int64_t* unix_time) = 0; @@ -532,6 +597,13 @@ class Env { const EnvOptions& env_options, const ImmutableDBOptions& db_options) const; + // OptimizeForBlobFileRead will create a new EnvOptions object that + // is a copy of the EnvOptions in the parameters, but is optimized for reading + // blob files. 
+ virtual EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const; + // Returns the status of all threads that belong to the current Env. virtual Status GetThreadList(std::vector* /*thread_list*/) { return Status::NotSupported("Env::GetThreadList() not supported."); @@ -567,6 +639,10 @@ class Env { // could be a fully implemented one, or a wrapper class around the Env const std::shared_ptr& GetFileSystem() const; + // Get the SystemClock implementation this Env was constructed with. It + // could be a fully implemented one, or a wrapper class around the Env + const std::shared_ptr& GetSystemClock() const; + // If you're adding methods here, remember to add them to EnvWrapper too. protected: @@ -576,6 +652,12 @@ class Env { // Pointer to the underlying FileSystem implementation std::shared_ptr file_system_; + + // Pointer to the underlying SystemClock implementation + std::shared_ptr system_clock_; + + private: + static const size_t kMaxHostNameLen = 256; }; // The factory function to construct a ThreadStatusUpdater. Any Env @@ -596,6 +678,10 @@ class SequentialFile { // "scratch[0..n-1]" must be live when "*result" is used. // If an error was encountered, returns a non-OK status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // REQUIRES: External synchronization virtual Status Read(size_t n, Slice* result, char* scratch) = 0; @@ -641,7 +727,8 @@ struct ReadRequest { // File offset in bytes uint64_t offset; - // Length to read in bytes + // Length to read in bytes. `result` only returns fewer bytes if end of file + // is hit (or `status` is not OK). size_t len; // A buffer that MultiRead() can optionally place data in. It can @@ -670,6 +757,10 @@ class RandomAccessFile { // "*result" is used. If an error was encountered, returns a non-OK // status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. virtual Status Read(uint64_t offset, size_t n, Slice* result, @@ -765,10 +856,22 @@ class WritableFile { virtual ~WritableFile(); // Append data to the end of the file - // Note: A WriteabelFile object must support either Append or + // Note: A WriteableFile object must support either Append or // PositionedAppend, so the users cannot mix the two. virtual Status Append(const Slice& data) = 0; + // Append data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if currently ChecksumType::kCRC32C is not supported by + // WritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). + virtual Status Append(const Slice& data, + const DataVerificationInfo& /* verification_info */) { + return Append(data); + } + // PositionedAppend data to the specified offset. The new EOF after append // must be larger than the previous EOF. 
This is to be used when writes are // not backed by OS buffers and hence has to always start from the start of @@ -795,6 +898,19 @@ class WritableFile { "WritableFile::PositionedAppend() not supported."); } + // PositionedAppend data with verification information. + // Note that this API change is experimental and it might be changed in + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if currently ChecksumType::kCRC32C is not supported by + // WritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). + virtual Status PositionedAppend( + const Slice& /* data */, uint64_t /* offset */, + const DataVerificationInfo& /* verification_info */) { + return Status::NotSupported("PositionedAppend"); + } + // Truncate is necessary to trim the file to the correct size // before closing. It is not always possible to keep track of the file // size due to whole pages writes. The behavior is undefined if called @@ -954,6 +1070,11 @@ class RandomRWFile { // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Returns Status::OK() on success. virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const = 0; @@ -1042,11 +1163,17 @@ class Logger { virtual void LogHeader(const char* format, va_list ap) { // Default implementation does a simple INFO level log write. // Please override as per the logger class requirement. - Logv(format, ap); + Logv(InfoLogLevel::INFO_LEVEL, format, ap); } // Write an entry to the log file with the specified format. - virtual void Logv(const char* format, va_list ap) = 0; + // + // Users who override the `Logv()` overload taking `InfoLogLevel` do not need + // to implement this, unless they explicitly invoke it in + // `Logv(InfoLogLevel, ...)`. + virtual void Logv(const char* /* format */, va_list /* ap */) { + assert(false); + } // Write an entry to the log file with the specified log level // and format. 
Any log with level under the internal log level @@ -1426,6 +1553,11 @@ class EnvWrapper : public Env { const ImmutableDBOptions& db_options) const override { return target_->OptimizeForCompactionTableRead(env_options, db_options); } + EnvOptions OptimizeForBlobFileRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(env_options, db_options); + } Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { return target_->GetFreeSpace(path, diskfree); } @@ -1497,9 +1629,18 @@ class WritableFileWrapper : public WritableFile { explicit WritableFileWrapper(WritableFile* t) : target_(t) {} Status Append(const Slice& data) override { return target_->Append(data); } + Status Append(const Slice& data, + const DataVerificationInfo& verification_info) override { + return target_->Append(data, verification_info); + } Status PositionedAppend(const Slice& data, uint64_t offset) override { return target_->PositionedAppend(data, offset); } + Status PositionedAppend( + const Slice& data, uint64_t offset, + const DataVerificationInfo& verification_info) override { + return target_->PositionedAppend(data, offset, verification_info); + } Status Truncate(uint64_t size) override { return target_->Truncate(size); } Status Close() override { return target_->Close(); } Status Flush() override { return target_->Flush(); } @@ -1648,6 +1789,6 @@ Env* NewTimedEnv(Env* base_env); Status NewEnvLogger(const std::string& fname, Env* env, std::shared_ptr* result); -std::unique_ptr NewCompositeEnv(std::shared_ptr fs); +std::unique_ptr NewCompositeEnv(const std::shared_ptr& fs); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h index 6c29dc953e9..7a76ec867b7 100644 --- a/include/rocksdb/env_encryption.h +++ b/include/rocksdb/env_encryption.h @@ -10,6 +10,7 @@ #include #include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { @@ -72,7 +73,7 @@ class BlockCipher { // - ROT13 Create a ROT13 Cipher // - ROT13:nn Create a ROT13 Cipher with block size of nn // @param result The new cipher object - // @return OK if the cipher was sucessfully created + // @return OK if the cipher was successfully created // @return NotFound if an invalid name was specified in the value // @return InvalidArgument if either the options were not valid static Status CreateFromString(const ConfigOptions& config_options, @@ -117,7 +118,7 @@ class EncryptionProvider { // - CTR Create a CTR provider // - test://CTR Create a CTR provider and initialize it for tests. // @param result The new provider object - // @return OK if the provider was sucessfully created + // @return OK if the provider was successfully created // @return NotFound if an invalid name was specified in the value // @return InvalidArgument if either the options were not valid static Status CreateFromString(const ConfigOptions& config_options, @@ -171,9 +172,9 @@ class EncryptionProvider { virtual Status TEST_Initialize() { return Status::OK(); } }; -class EncryptedSequentialFile : public SequentialFile { +class EncryptedSequentialFile : public FSSequentialFile { protected: - std::unique_ptr file_; + std::unique_ptr file_; std::unique_ptr stream_; uint64_t offset_; size_t prefixLength_; @@ -181,7 +182,7 @@ class EncryptedSequentialFile : public SequentialFile { public: // Default ctor. 
Given underlying sequential file is supposed to be at // offset == prefixLength. - EncryptedSequentialFile(std::unique_ptr&& f, + EncryptedSequentialFile(std::unique_ptr&& f, std::unique_ptr&& s, size_t prefixLength) : file_(std::move(f)), @@ -197,7 +198,8 @@ class EncryptedSequentialFile : public SequentialFile { // If an error was encountered, returns a non-OK status. // // REQUIRES: External synchronization - virtual Status Read(size_t n, Slice* result, char* scratch) override; + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; // Skip "n" bytes from the file. This is guaranteed to be no // slower that reading the same data, but may be faster. @@ -206,36 +208,37 @@ class EncryptedSequentialFile : public SequentialFile { // file, and Skip will return OK. // // REQUIRES: External synchronization - virtual Status Skip(uint64_t n) override; + IOStatus Skip(uint64_t n) override; // Indicates the upper layers if the current SequentialFile implementation // uses direct IO. - virtual bool use_direct_io() const override; + bool use_direct_io() const override; // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override; + size_t GetRequiredBufferAlignment() const override; // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; // Positioned Read for direct I/O // If Direct I/O enabled, offset, n, and scratch should be properly aligned - virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) override; + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; }; // A file abstraction for randomly reading the contents of a file. -class EncryptedRandomAccessFile : public RandomAccessFile { +class EncryptedRandomAccessFile : public FSRandomAccessFile { protected: - std::unique_ptr file_; + std::unique_ptr file_; std::unique_ptr stream_; size_t prefixLength_; public: - EncryptedRandomAccessFile(std::unique_ptr&& f, + EncryptedRandomAccessFile(std::unique_ptr&& f, std::unique_ptr&& s, size_t prefixLength) : file_(std::move(f)), @@ -252,11 +255,13 @@ class EncryptedRandomAccessFile : public RandomAccessFile { // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; // Readahead the file starting from offset by n bytes for caching. - virtual Status Prefetch(uint64_t offset, size_t n) override; + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override; // Tries to get an unique ID for this file that will be the same each time // the file is opened (and will stay the same while the file is open). @@ -273,71 +278,76 @@ class EncryptedRandomAccessFile : public RandomAccessFile { // a single varint. // // Note: these IDs are only valid for the duration of the process. 
- virtual size_t GetUniqueId(char* id, size_t max_size) const override; + size_t GetUniqueId(char* id, size_t max_size) const override; - virtual void Hint(AccessPattern pattern) override; + void Hint(AccessPattern pattern) override; // Indicates the upper layers if the current RandomAccessFile implementation // uses direct IO. - virtual bool use_direct_io() const override; + bool use_direct_io() const override; // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override; + size_t GetRequiredBufferAlignment() const override; // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; }; // A file abstraction for sequential writing. The implementation // must provide buffering since callers may append small fragments // at a time to the file. -class EncryptedWritableFile : public WritableFileWrapper { +class EncryptedWritableFile : public FSWritableFile { protected: - std::unique_ptr file_; + std::unique_ptr file_; std::unique_ptr stream_; size_t prefixLength_; public: // Default ctor. Prefix is assumed to be written already. - EncryptedWritableFile(std::unique_ptr&& f, + EncryptedWritableFile(std::unique_ptr&& f, std::unique_ptr&& s, size_t prefixLength) - : WritableFileWrapper(f.get()), - file_(std::move(f)), + : file_(std::move(f)), stream_(std::move(s)), prefixLength_(prefixLength) {} - Status Append(const Slice& data) override; + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; - Status PositionedAppend(const Slice& data, uint64_t offset) override; + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; // Indicates the upper layers if the current WritableFile implementation // uses direct IO. - virtual bool use_direct_io() const override; + bool use_direct_io() const override; // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override; + size_t GetRequiredBufferAlignment() const override; /* * Get the size of valid data in the file. */ - virtual uint64_t GetFileSize() override; + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; // Truncate is necessary to trim the file to the correct size // before closing. It is not always possible to keep track of the file // size due to whole pages writes. The behavior is undefined if called // with other writes to follow. - virtual Status Truncate(uint64_t size) override; + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. // This call has no effect on dirty pages in the cache. - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; // Sync a file range with disk. // offset is the starting byte of the file range to be synchronized. 
@@ -345,28 +355,42 @@ class EncryptedWritableFile : public WritableFileWrapper { // This asks the OS to initiate flushing the cached data to disk, // without waiting for completion. // Default implementation does nothing. - virtual Status RangeSync(uint64_t offset, uint64_t nbytes) override; + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, + IODebugContext* dbg) override; // PrepareWrite performs any necessary preparation for a write // before the write actually occurs. This allows for pre-allocation // of space on devices where it can result in less file // fragmentation and/or less waste from over-zealous filesystem // pre-allocation. - virtual void PrepareWrite(size_t offset, size_t len) override; + void PrepareWrite(size_t offset, size_t len, const IOOptions& options, + IODebugContext* dbg) override; + + void SetPreallocationBlockSize(size_t size) override; + + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override; // Pre-allocates space for a file. - virtual Status Allocate(uint64_t offset, uint64_t len) override; + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; + + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; + + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; }; // A file abstraction for random reading and writing. -class EncryptedRandomRWFile : public RandomRWFile { +class EncryptedRandomRWFile : public FSRandomRWFile { protected: - std::unique_ptr file_; + std::unique_ptr file_; std::unique_ptr stream_; size_t prefixLength_; public: - EncryptedRandomRWFile(std::unique_ptr&& f, + EncryptedRandomRWFile(std::unique_ptr&& f, std::unique_ptr&& s, size_t prefixLength) : file_(std::move(f)), @@ -375,31 +399,49 @@ class EncryptedRandomRWFile : public RandomRWFile { // Indicates if the class makes use of direct I/O // If false you must pass aligned buffer to Write() - virtual bool use_direct_io() const override; + bool use_direct_io() const override; // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override; + size_t GetRequiredBufferAlignment() const override; // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. - virtual Status Write(uint64_t offset, const Slice& data) override; + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. // Returns Status::OK() on success. 
- virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Fsync() override; + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Close() override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; }; +class EncryptedFileSystem : public FileSystemWrapper { + public: + explicit EncryptedFileSystem(const std::shared_ptr& base) + : FileSystemWrapper(base) {} + // Method to add a new cipher key for use by the EncryptionProvider. + // @param description Descriptor for this key. + // @param cipher The cryptographic key to use + // @param len The length of the cipher key + // @param for_write If true, this cipher should be used for writing files. + // If false, this cipher should only be used for reading + // files + // @return OK if the cipher was successfully added to the provider, non-OK + // otherwise + virtual Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) = 0; +}; } // namespace ROCKSDB_NAMESPACE #endif // !defined(ROCKSDB_LITE) diff --git a/include/rocksdb/file_checksum.h b/include/rocksdb/file_checksum.h index 37b1744ce47..00b2b9d5740 100644 --- a/include/rocksdb/file_checksum.h +++ b/include/rocksdb/file_checksum.h @@ -76,7 +76,7 @@ class FileChecksumGenFactory { }; // FileChecksumList stores the checksum information of a list of files (e.g., -// SST files). The FileChecksumLIst can be used to store the checksum +// SST files). The FileChecksumList can be used to store the checksum // information of all SST file getting from the MANIFEST, which are // the checksum information of all valid SST file of a DB instance. It can // also be used to store the checksum information of a list of SST files to @@ -116,7 +116,7 @@ class FileChecksumList { // Create a new file checksum list. extern FileChecksumList* NewFileChecksumList(); -// Return a shared_ptr of the builtin Crc32c based file checksum generatory +// Return a shared_ptr of the builtin Crc32c based file checksum generator // factory object, which can be shared to create the Crc32c based checksum // generator object. // Note: this implementation is compatible with many other crc32c checksum diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 3683491c1ba..025908e4f97 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -17,6 +17,7 @@ #pragma once #include + #include #include #include @@ -25,9 +26,11 @@ #include #include #include + #include "rocksdb/env.h" #include "rocksdb/io_status.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" #include "rocksdb/thread_status.h" namespace ROCKSDB_NAMESPACE { @@ -43,6 +46,7 @@ class Slice; struct ImmutableDBOptions; struct MutableDBOptions; class RateLimiter; +struct ConfigOptions; using AccessPattern = RandomAccessFile::AccessPattern; using FileAttributes = Env::FileAttributes; @@ -97,16 +101,30 @@ struct FileOptions : EnvOptions { // to be issued for the file open/creation IOOptions io_options; - FileOptions() : EnvOptions() {} + // EXPERIMENTAL + // The feature is in development and is subject to change. 
+ // When creating a new file, set the temperature of the file so that + // underlying file systems can put it with appropriate storage media and/or + // coding. + Temperature temperature = Temperature::kUnknown; + + // The checksum type that is used to calculate the checksum value for + // handoff during file writes. + ChecksumType handoff_checksum_type; + + FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const DBOptions& opts) - : EnvOptions(opts) {} + : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const EnvOptions& opts) - : EnvOptions(opts) {} + : EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const FileOptions& opts) - : EnvOptions(opts), io_options(opts.io_options) {} + : EnvOptions(opts), + io_options(opts.io_options), + temperature(opts.temperature), + handoff_checksum_type(opts.handoff_checksum_type) {} FileOptions& operator=(const FileOptions& opts) = default; }; @@ -123,12 +141,36 @@ struct IODebugContext { // To be set by the FileSystem implementation std::string msg; + // To be set by the underlying FileSystem implementation. + std::string request_id; + + // In order to log required information in IO tracing for different + // operations, Each bit in trace_data stores which corresponding info from + // IODebugContext will be added in the trace. Foreg, if trace_data = 1, it + // means bit at position 0 is set so TraceData::kRequestID (request_id) will + // be logged in the trace record. + // + enum TraceData : char { + // The value of each enum represents the bitwise position for + // that information in trace_data which will be used by IOTracer for + // tracing. Make sure to add them sequentially. + kRequestID = 0, + }; + uint64_t trace_data = 0; + IODebugContext() {} void AddCounter(std::string& name, uint64_t value) { counters.emplace(name, value); } + // Called by underlying file system to set request_id and log request_id in + // IOTracing. + void SetRequestId(const std::string& _request_id) { + request_id = _request_id; + trace_data |= (1 << TraceData::kRequestID); + } + std::string ToString() { std::ostringstream ss; ss << file_path << ", "; @@ -168,9 +210,24 @@ class FileSystem { static const char* Type() { return "FileSystem"; } // Loads the FileSystem specified by the input value into the result + // The CreateFromString alternative should be used; this method may be + // deprecated in a future release. static Status Load(const std::string& value, std::shared_ptr* result); + // Loads the FileSystem specified by the input value into the result + // @see Customizable for a more detailed description of the parameters and + // return codes + // @param config_options Controls how the FileSystem is loaded + // @param value The name and optional properties describing the file system + // to load. + // @param result On success, returns the loaded FileSystem + // @return OK if the FileSystem was successfully loaded. + // @return not-OK if the load failed. + static Status CreateFromString(const ConfigOptions& options, + const std::string& value, + std::shared_ptr* result); + // Return a default fie_system suitable for the current operating // system. 
Sophisticated users may wish to provide their own Env // implementation instead of relying on this default file_system @@ -262,7 +319,7 @@ class FileSystem { virtual IOStatus ReopenWritableFile( const std::string& /*fname*/, const FileOptions& /*options*/, std::unique_ptr* /*result*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("ReopenWritableFile"); } // Reuse an existing file by renaming it and opening it as writable. @@ -366,6 +423,10 @@ class FileSystem { return IOStatus::OK(); } +// This seems to clash with a macro on Windows, so #undef it here +#ifdef DeleteFile +#undef DeleteFile +#endif // Delete the named file. virtual IOStatus DeleteFile(const std::string& fname, const IOOptions& options, @@ -460,7 +521,7 @@ class FileSystem { IODebugContext* dbg) = 0; // Create and returns a default logger (an instance of EnvLogger) for storing - // informational messages. Derived classes can overide to provide custom + // informational messages. Derived classes can override to provide custom // logger. virtual IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, std::shared_ptr* result, @@ -513,6 +574,13 @@ class FileSystem { const FileOptions& file_options, const ImmutableDBOptions& db_options) const; + // OptimizeForBlobFileRead will create a new FileOptions object that + // is a copy of the FileOptions in the parameters, but is optimized for + // reading blob files. + virtual FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const; + // This seems to clash with a macro on Windows, so #undef it here #ifdef GetFreeSpace #undef GetFreeSpace @@ -523,7 +591,7 @@ class FileSystem { const IOOptions& /*options*/, uint64_t* /*diskfree*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("GetFreeSpace"); } virtual IOStatus IsDirectory(const std::string& /*path*/, @@ -550,6 +618,10 @@ class FSSequentialFile { // "scratch[0..n-1]" must be live when "*result" is used. // If an error was encountered, returns a non-OK status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // REQUIRES: External synchronization virtual IOStatus Read(size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) = 0; @@ -584,7 +656,7 @@ class FSSequentialFile { const IOOptions& /*options*/, Slice* /*result*/, char* /*scratch*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("PositionedRead"); } // If you're adding methods here, remember to add them to @@ -596,7 +668,8 @@ struct FSReadRequest { // File offset in bytes uint64_t offset; - // Length to read in bytes + // Length to read in bytes. `result` only returns fewer bytes if end of file + // is hit (or `status` is not OK). size_t len; // A buffer that MultiRead() can optionally place data in. It can @@ -626,6 +699,10 @@ class FSRandomAccessFile { // "*result" is used. If an error was encountered, returns a non-OK // status. // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. 
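The Read contract spelled out above (result->size() < n only at end of file, or with a non-OK status) makes a simple short-read loop sufficient for draining a file. A minimal sketch under that assumption, using the default FileSystem; the helper name and chunk size are illustrative.

#include <string>
#include <vector>

#include "rocksdb/file_system.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: read an entire file through FSSequentialFile, treating any
// short read as end of file per the contract documented above.
IOStatus ReadWholeFile(const std::string& fname, std::string* out) {
  std::shared_ptr<FileSystem> fs = FileSystem::Default();
  std::unique_ptr<FSSequentialFile> file;
  IOStatus s = fs->NewSequentialFile(fname, FileOptions(), &file, /*dbg=*/nullptr);
  if (!s.ok()) {
    return s;
  }
  constexpr size_t kChunk = 64 * 1024;
  std::vector<char> scratch(kChunk);
  Slice result;
  do {
    s = file->Read(kChunk, IOOptions(), &result, scratch.data(), /*dbg=*/nullptr);
    if (!s.ok()) {
      return s;
    }
    out->append(result.data(), result.size());
  } while (result.size() == kChunk);  // short read => end of file
  return IOStatus::OK();
}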
virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, @@ -638,7 +715,7 @@ class FSRandomAccessFile { virtual IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, const IOOptions& /*options*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("Prefetch"); } // Read a bunch of blocks as described by reqs. The blocks can @@ -703,7 +780,7 @@ class FSRandomAccessFile { }; // A data structure brings the data verification information, which is -// used togther with data being written to a file. +// used together with data being written to a file. struct DataVerificationInfo { // checksum of the data being written. Slice checksum; @@ -731,15 +808,19 @@ class FSWritableFile { virtual ~FSWritableFile() {} // Append data to the end of the file - // Note: A WriteabelFile object must support either Append or + // Note: A WriteableFile object must support either Append or // PositionedAppend, so the users cannot mix the two. virtual IOStatus Append(const Slice& data, const IOOptions& options, IODebugContext* dbg) = 0; - // EXPERIMENTAL / CURRENTLY UNUSED - // Append data with verification information + // Append data with verification information. // Note that this API change is experimental and it might be changed in - // the future. Currently, RocksDB does not use this API. + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if the handoff_checksum_type in FileOptions (currently, + // ChecksumType::kCRC32C is set as default) is not supported by this + // FSWritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). virtual IOStatus Append(const Slice& data, const IOOptions& options, const DataVerificationInfo& /* verification_info */, IODebugContext* dbg) { @@ -770,19 +851,23 @@ class FSWritableFile { uint64_t /* offset */, const IOOptions& /*options*/, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("PositionedAppend"); } - // EXPERIMENTAL / CURRENTLY UNUSED // PositionedAppend data with verification information. // Note that this API change is experimental and it might be changed in - // the future. Currently, RocksDB does not use this API. + // the future. Currently, RocksDB only generates crc32c based checksum for + // the file writes when the checksum handoff option is set. + // Expected behavior: if the handoff_checksum_type in FileOptions (currently, + // ChecksumType::kCRC32C is set as default) is not supported by this + // FSWritableFile, the information in DataVerificationInfo can be ignored + // (i.e. does not perform checksum verification). virtual IOStatus PositionedAppend( const Slice& /* data */, uint64_t /* offset */, const IOOptions& /*options*/, const DataVerificationInfo& /* verification_info */, IODebugContext* /*dbg*/) { - return IOStatus::NotSupported(); + return IOStatus::NotSupported("PositionedAppend"); } // Truncate is necessary to trim the file to the correct size @@ -954,6 +1039,11 @@ class FSRandomRWFile { // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. + // + // After call, result->size() < n only if end of file has been + // reached (or non-OK status). Read might fail if called again after + // first result->size() < n. + // // Returns Status::OK() on success. 
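As the checksum-handoff comments above note, an FSWritableFile that does not support the handed-off checksum type may ignore DataVerificationInfo. A minimal sketch of that fallback in a forwarding wrapper; the class name is illustrative and only the pure-virtual methods are overridden here.

#include <memory>
#include <utility>

#include "rocksdb/file_system.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: forwards writes to a wrapped file and ignores the handed-off
// checksum, which the comments above describe as acceptable behavior.
class PassthroughWritableFile : public FSWritableFile {
 public:
  explicit PassthroughWritableFile(std::unique_ptr<FSWritableFile>&& target)
      : target_(std::move(target)) {}

  IOStatus Append(const Slice& data, const IOOptions& options,
                  IODebugContext* dbg) override {
    return target_->Append(data, options, dbg);
  }
  IOStatus Append(const Slice& data, const IOOptions& options,
                  const DataVerificationInfo& /*verification_info*/,
                  IODebugContext* dbg) override {
    // No checksum support here: fall back to the plain Append.
    return Append(data, options, dbg);
  }
  IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
    return target_->Close(options, dbg);
  }
  IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
    return target_->Flush(options, dbg);
  }
  IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
    return target_->Sync(options, dbg);
  }

 private:
  std::unique_ptr<FSWritableFile> target_;
};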
virtual IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, @@ -1048,7 +1138,8 @@ class FSDirectory { class FileSystemWrapper : public FileSystem { public: // Initialize an EnvWrapper that delegates all calls to *t - explicit FileSystemWrapper(std::shared_ptr t) : target_(t) {} + explicit FileSystemWrapper(const std::shared_ptr& t) + : target_(t) {} ~FileSystemWrapper() override {} const char* Name() const override { return target_->Name(); } @@ -1229,6 +1320,11 @@ class FileSystemWrapper : public FileSystem { const ImmutableDBOptions& db_options) const override { return target_->OptimizeForCompactionTableRead(file_options, db_options); } + FileOptions OptimizeForBlobFileRead( + const FileOptions& file_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForBlobFileRead(file_options, db_options); + } IOStatus GetFreeSpace(const std::string& path, const IOOptions& options, uint64_t* diskfree, IODebugContext* dbg) override { return target_->GetFreeSpace(path, options, diskfree, dbg); diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 3cd85a22601..fc1985d323e 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -21,6 +21,7 @@ #include +#include #include #include #include @@ -28,6 +29,7 @@ #include "rocksdb/advanced_options.h" #include "rocksdb/status.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { @@ -40,33 +42,48 @@ class FilterBitsBuilder { public: virtual ~FilterBitsBuilder() {} - // Add Key to filter, you could use any way to store the key. - // Such as: storing hashes or original keys - // Keys are in sorted order and duplicated keys are possible. + // Add a key (or prefix) to the filter. Typically, a builder will keep + // a set of 64-bit key hashes and only build the filter in Finish + // when the final number of keys is known. Keys are added in sorted order + // and duplicated keys are possible, so typically, the builder will + // only add this key if its hash is different from the most recently + // added. virtual void AddKey(const Slice& key) = 0; + // Called by RocksDB before Finish to populate + // TableProperties::num_filter_entries, so should represent the + // number of unique keys (and/or prefixes) added, but does not have + // to be exact. + virtual size_t EstimateEntriesAdded() { + // Default implementation for backward compatibility. + // 0 conspicuously stands for "unknown". + return 0; + } + // Generate the filter using the keys that are added // The return value of this function would be the filter bits, // The ownership of actual data is set to buf virtual Slice Finish(std::unique_ptr* buf) = 0; - // Calculate num of keys that can be added and generate a filter - // <= the specified number of bytes. -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable : 4702) // unreachable code -#endif - virtual int CalculateNumEntry(const uint32_t /*bytes*/) { -#ifndef ROCKSDB_LITE - throw std::runtime_error("CalculateNumEntry not Implemented"); -#else - abort(); -#endif - return 0; + // Approximate the number of keys that can be added and generate a filter + // <= the specified number of bytes. Callers (including RocksDB) should + // only use this result for optimizing performance and not as a guarantee. + // This default implementation is for compatibility with older custom + // FilterBitsBuilders only implementing deprecated CalculateNumEntry. 
+ virtual size_t ApproximateNumEntries(size_t bytes) { + bytes = std::min(bytes, size_t{0xffffffff}); + return static_cast(CalculateNumEntry(static_cast(bytes))); + } + + // Old, DEPRECATED version of ApproximateNumEntries. This is not + // called by RocksDB except as the default implementation of + // ApproximateNumEntries for API compatibility. + virtual int CalculateNumEntry(const uint32_t bytes) { + // DEBUG: ideally should not rely on this implementation + assert(false); + // RELEASE: something reasonably conservative: 2 bytes per entry + return static_cast(bytes / 2); } -#if defined(_MSC_VER) -#pragma warning(pop) -#endif }; // A class that checks if a key can be in filter @@ -96,18 +113,32 @@ struct FilterBuildingContext { // Options for the table being built const BlockBasedTableOptions& table_options; - // Name of the column family for the table (or empty string if unknown) - std::string column_family_name; - - // The compactions style in effect for the table + // BEGIN from (DB|ColumnFamily)Options in effect at table creation time CompactionStyle compaction_style = kCompactionStyleLevel; - // The table level at time of constructing the SST file, or -1 if unknown. - // (The table file could later be used at a different level.) - int level_at_creation = -1; + // Number of LSM levels, or -1 if unknown + int num_levels = -1; // An optional logger for reporting errors, warnings, etc. Logger* info_log = nullptr; + // END from (DB|ColumnFamily)Options + + // Name of the column family for the table (or empty string if unknown) + // TODO: consider changing to Slice + std::string column_family_name; + + // The table level at time of constructing the SST file, or -1 if unknown + // or N/A as in SstFileWriter. (The table file could later be used at a + // different level.) + int level_at_creation = -1; + + // True if known to be going into bottommost sorted run for applicable + // key range (which might not even be last level with data). False + // otherwise. + bool is_bottommost = false; + + // Reason for creating the file with the filter + TableFileCreationReason reason = TableFileCreationReason::kMisc; }; // We add a new format of filter block called full filter block @@ -212,4 +243,35 @@ class FilterPolicy { // trailing spaces in keys. extern const FilterPolicy* NewBloomFilterPolicy( double bits_per_key, bool use_block_based_builder = false); + +// A new Bloom alternative that saves about 30% space compared to +// Bloom filters, with similar query times but roughly 3-4x CPU time +// and 3x temporary space usage during construction. For example, if +// you pass in 10 for bloom_equivalent_bits_per_key, you'll get the same +// 0.95% FP rate as Bloom filter but only using about 7 bits per key. +// +// Ribbon filters are compatible with RocksDB >= 6.15.0. Earlier +// versions reading the data will behave as if no filter was used +// (degraded performance until compaction rebuilds filters). All +// built-in FilterPolicies (Bloom or Ribbon) are able to read other +// kinds of built-in filters. +// +// Note: the current Ribbon filter schema uses some extra resources +// when constructing very large filters. For example, for 100 million +// keys in a single filter (one SST file without partitioned filters), +// 3GB of temporary, untracked memory is used, vs. 1GB for Bloom. +// However, the savings in filter space from just ~60 open SST files +// makes up for the additional temporary memory use. +// +// Also consider using optimize_filters_for_memory to save filter +// memory. 
+extern const FilterPolicy* NewRibbonFilterPolicy( + double bloom_equivalent_bits_per_key); + +// Old name +inline const FilterPolicy* NewExperimentalRibbonFilterPolicy( + double bloom_equivalent_bits_per_key) { + return NewRibbonFilterPolicy(bloom_equivalent_bits_per_key); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/functor_wrapper.h b/include/rocksdb/functor_wrapper.h new file mode 100644 index 00000000000..c5f7414b1a8 --- /dev/null +++ b/include/rocksdb/functor_wrapper.h @@ -0,0 +1,55 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +namespace detail { +template +struct IndexSequence {}; + +template +struct IndexSequenceHelper + : public IndexSequenceHelper {}; + +template +struct IndexSequenceHelper<0U, Next...> { + using type = IndexSequence; +}; + +template +using make_index_sequence = typename IndexSequenceHelper::type; + +template +void call(Function f, Tuple t, IndexSequence) { + f(std::get(t)...); +} + +template +void call(Function f, Tuple t) { + static constexpr auto size = std::tuple_size::value; + call(f, t, make_index_sequence{}); +} +} // namespace detail + +template +class FunctorWrapper { + public: + explicit FunctorWrapper(std::function functor, Args &&...args) + : functor_(std::move(functor)), args_(std::forward(args)...) {} + + void invoke() { detail::call(functor_, args_); } + + private: + std::function functor_; + std::tuple args_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h index b31b6d70a4f..0f6ab692ea5 100644 --- a/include/rocksdb/iostats_context.h +++ b/include/rocksdb/iostats_context.h @@ -50,7 +50,15 @@ struct IOStatsContext { uint64_t cpu_read_nanos; }; -// Get Thread-local IOStatsContext object pointer +// If RocksDB is compiled with -DNIOSTATS_CONTEXT, then a pointer to a global, +// non-thread-local IOStatsContext object will be returned. Attempts to update +// this object will be ignored, and reading from it will also be no-op. +// Otherwise, +// a) if thread-local is supported on the platform, then a pointer to +// a thread-local IOStatsContext object will be returned. +// b) if thread-local is NOT supported, then compilation will fail. +// +// This function never returns nullptr. IOStatsContext* get_iostats_context(); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h index aa2f2a3ff4c..eb3f42acd69 100644 --- a/include/rocksdb/iterator.h +++ b/include/rocksdb/iterator.h @@ -45,7 +45,6 @@ class Iterator : public Cleanable { // Position at the last key in the source. The iterator is // Valid() after this call iff the source is not empty. - // Currently incompatible with user timestamp. virtual void SeekToLast() = 0; // Position at the first key in the source that at or past target. @@ -60,7 +59,7 @@ class Iterator : public Cleanable { // Position at the last key in the source that at or before target. // The iterator is Valid() after this call iff the source contains // an entry that comes at or before target. - // Currently incompatible with user timestamp. + // Target does not contain timestamp. 
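A short usage sketch for the Ribbon policy declared above, wired into BlockBasedTableOptions the same way a Bloom policy is; the 10 bits-per-key figure mirrors the example in the comment, and the function name is illustrative.

#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: enable Ribbon filters at ~10 Bloom-equivalent bits per key.
Options MakeRibbonOptions() {
  BlockBasedTableOptions bbto;
  bbto.filter_policy.reset(
      NewRibbonFilterPolicy(/*bloom_equivalent_bits_per_key=*/10));
  Options options;
  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
  return options;
}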
virtual void SeekForPrev(const Slice& target) = 0; // Moves to the next entry in the source. After this call, Valid() is @@ -70,7 +69,6 @@ class Iterator : public Cleanable { // Moves to the previous entry in the source. After this call, Valid() is // true iff the iterator was not positioned at the first entry in source. - // Currently incompatible with user timestamp. // REQUIRES: Valid() virtual void Prev() = 0; diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index e90a8707af4..dec3e442074 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -16,6 +16,7 @@ #include "rocksdb/compression_type.h" #include "rocksdb/status.h" #include "rocksdb/table_properties.h" +#include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { @@ -27,13 +28,6 @@ class ColumnFamilyHandle; class Status; struct CompactionJobStats; -enum class TableFileCreationReason { - kFlush, - kCompaction, - kRecovery, - kMisc, -}; - struct TableFileCreationBriefInfo { // the name of the database where the file was created std::string db_name; @@ -118,8 +112,13 @@ enum class FlushReason : int { // When set the flush reason to kErrorRecoveryRetryFlush, SwitchMemtable // will not be called to avoid many small immutable memtables. kErrorRecoveryRetryFlush = 0xc, + kWalFull = 0xd, }; +// TODO: In the future, BackgroundErrorReason will only be used to indicate +// why the BG Error is happening (e.g., flush, compaction). We may introduce +// other data structure to indicate other essential information such as +// the file type (e.g., Manifest, SST) and special context. enum class BackgroundErrorReason { kFlush, kCompaction, @@ -127,6 +126,7 @@ enum class BackgroundErrorReason { kMemTable, kManifestWrite, kFlushNoWAL, + kManifestWriteNoWAL, }; enum class WriteStallCondition { @@ -333,13 +333,18 @@ struct ExternalFileIngestionInfo { // be used as a building block for developing custom features such as // stats-collector or external compaction algorithm. // -// Note that callback functions should not run for an extended period of -// time before the function returns, otherwise RocksDB may be blocked. -// For example, it is not suggested to do DB::CompactFiles() (as it may -// run for a long while) or issue many of DB::Put() (as Put may be blocked -// in certain cases) in the same thread in the EventListener callback. -// However, doing DB::CompactFiles() and DB::Put() in another thread is -// considered safe. +// IMPORTANT +// Because compaction is needed to resolve a "writes stopped" condition, +// calling or waiting for any blocking DB write function (no_slowdown=false) +// from a compaction-related listener callback can hang RocksDB. For DB +// writes from a callback we recommend a WriteBatch and no_slowdown=true, +// because the WriteBatch can accumulate writes for later in case DB::Write +// returns Status::Incomplete. Similarly, calling CompactRange or similar +// could hang by waiting for a background worker that is occupied until the +// callback returns. +// +// Otherwise, callback functions should not run for an extended period of +// time before the function returns, because this will slow RocksDB. // // [Threading] All EventListener callback will be called using the // actual thread that involves in that specific event. 
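A sketch of the write pattern the IMPORTANT note above recommends for listener callbacks: a no_slowdown write whose batch is kept for a later retry when the DB reports Status::Incomplete. The listener, key, and retry queue are illustrative, and synchronization of the queue is omitted.

#include <utility>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/listener.h"
#include "rocksdb/write_batch.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: never wait on a write stall from inside a callback.
class FlushMarkerListener : public EventListener {
 public:
  void OnFlushCompleted(DB* db, const FlushJobInfo& /*info*/) override {
    WriteBatch batch;
    batch.Put("last_flush_marker", "1");  // illustrative payload
    WriteOptions write_options;
    write_options.no_slowdown = true;
    Status s = db->Write(write_options, &batch);
    if (s.IsIncomplete()) {
      // Writes are stalled; keep the batch and retry outside the callback.
      pending_.push_back(std::move(batch));  // synchronization omitted
    }
  }

 private:
  std::vector<WriteBatch> pending_;
};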
For example, it diff --git a/include/rocksdb/memory_allocator.h b/include/rocksdb/memory_allocator.h index 60256a9772b..51442239a9e 100644 --- a/include/rocksdb/memory_allocator.h +++ b/include/rocksdb/memory_allocator.h @@ -45,31 +45,31 @@ struct JemallocAllocatorOptions { bool limit_tcache_size = false; // Lower bound of allocation size to use tcache, if limit_tcache_size=true. - // When used with block cache, it is recommneded to set it to block_size/4. + // When used with block cache, it is recommended to set it to block_size/4. size_t tcache_size_lower_bound = 1024; // Upper bound of allocation size to use tcache, if limit_tcache_size=true. - // When used with block cache, it is recommneded to set it to block_size. + // When used with block cache, it is recommended to set it to block_size. size_t tcache_size_upper_bound = 16 * 1024; }; -// Generate memory allocators which allocates through Jemalloc and utilize -// MADV_DONTDUMP through madvice to exclude cache items from core dump. +// Generate memory allocator which allocates through Jemalloc and utilize +// MADV_DONTDUMP through madvise to exclude cache items from core dump. // Applications can use the allocator with block cache to exclude block cache // usage from core dump. // // Implementation details: -// The JemallocNodumpAllocator creates a delicated jemalloc arena, and all -// allocations of the JemallocNodumpAllocator is through the same arena. -// The memory allocator hooks memory allocation of the arena, and call -// madvice() with MADV_DONTDUMP flag to exclude the piece of memory from -// core dump. Side benefit of using single arena would be reduce of jemalloc -// metadata for some workload. +// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all +// allocations of the JemallocNodumpAllocator are through the same arena. +// The memory allocator hooks memory allocation of the arena, and calls +// madvise() with MADV_DONTDUMP flag to exclude the piece of memory from +// core dump. Side benefit of using single arena would be reduction of jemalloc +// metadata for some workloads. // // To mitigate mutex contention for using one single arena, jemalloc tcache // (thread-local cache) is enabled to cache unused allocations for future use. -// The tcache normally incur 0.5M extra memory usage per-thread. The usage -// can be reduce by limitting allocation sizes to cache. +// The tcache normally incurs 0.5M extra memory usage per-thread. The usage +// can be reduced by limiting allocation sizes to cache. extern Status NewJemallocNodumpAllocator( JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator); diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 49723264a59..b8701135d3e 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -62,7 +62,7 @@ class MemTableRep { typedef ROCKSDB_NAMESPACE::Slice DecodedType; virtual DecodedType decode_key(const char* key) const { - // The format of key is frozen and can be terated as a part of the API + // The format of key is frozen and can be treated as a part of the API // contract. Refer to MemTable::Add for details. return GetLengthPrefixedSlice(key); } @@ -120,7 +120,7 @@ class MemTableRep { return true; } - // Same as ::InsertWithHint, but allow concurrnet write + // Same as ::InsertWithHint, but allow concurrent write // // If hint points to nullptr, a new hint will be allocated on heap, otherwise // the hint will be updated to reflect the last insert location. 
The hint is diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index a0a99fc4a99..0b04ec3102b 100755 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -109,7 +109,7 @@ class MergeOperator { Slice& existing_operand; }; - // This function applies a stack of merge operands in chrionological order + // This function applies a stack of merge operands in chronological order // on top of an existing value. There are two ways in which this method is // being used: // a) During Get() operation, it used to calculate the final value of a key @@ -176,7 +176,7 @@ class MergeOperator { // PartialMergeMulti should combine them into a single merge operation that is // saved into *new_value, and then it should return true. *new_value should // be constructed such that a call to DB::Merge(key, *new_value) would yield - // the same result as subquential individual calls to DB::Merge(key, operand) + // the same result as sequential individual calls to DB::Merge(key, operand) // for each operand in operand_list from front() to back(). // // The string that new_value is pointing to will be empty. diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index 9a64a7a8f68..b515c51a1e5 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -11,6 +11,7 @@ #include #include +#include "rocksdb/options.h" #include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { @@ -62,6 +63,7 @@ struct SstFileMetaData { being_compacted(false), num_entries(0), num_deletions(0), + temperature(Temperature::kUnknown), oldest_blob_file_number(0), oldest_ancester_time(0), file_creation_time(0) {} @@ -71,7 +73,8 @@ struct SstFileMetaData { SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, const std::string& _smallestkey, const std::string& _largestkey, uint64_t _num_reads_sampled, - bool _being_compacted, uint64_t _oldest_blob_file_number, + bool _being_compacted, Temperature _temperature, + uint64_t _oldest_blob_file_number, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, std::string& _file_checksum, std::string& _file_checksum_func_name) @@ -87,6 +90,7 @@ struct SstFileMetaData { being_compacted(_being_compacted), num_entries(0), num_deletions(0), + temperature(_temperature), oldest_blob_file_number(_oldest_blob_file_number), oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time), @@ -112,18 +116,21 @@ struct SstFileMetaData { uint64_t num_entries; uint64_t num_deletions; + // This feature is experimental and subject to change. + Temperature temperature; + uint64_t oldest_blob_file_number; // The id of the oldest blob file // referenced by the file. // An SST file may be generated by compactions whose input files may // in turn be generated by earlier compactions. The creation time of the - // oldest SST file that is the compaction ancester of this file. - // The timestamp is provided Env::GetCurrentTime(). + // oldest SST file that is the compaction ancestor of this file. + // The timestamp is provided SystemClock::GetCurrentTime(). // 0 if the information is not available. // // Note: for TTL blob files, it contains the start of the expiration range. uint64_t oldest_ancester_time; - // Timestamp when the SST file is created, provided by Env::GetCurrentTime(). - // 0 if the information is not available. + // Timestamp when the SST file is created, provided by + // SystemClock::GetCurrentTime(). 0 if the information is not available. 
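A sketch of how the jemalloc no-dump allocator described above is typically attached to a block cache. It assumes a jemalloc-enabled build (otherwise NewJemallocNodumpAllocator returns a non-OK status) and the LRUCacheOptions::memory_allocator field; names are illustrative.

#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/memory_allocator.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: keep block cache contents out of core dumps when jemalloc is
// available; fall back to the default allocator otherwise.
std::shared_ptr<Cache> MakeNodumpBlockCache(size_t capacity) {
  JemallocAllocatorOptions jopts;
  jopts.limit_tcache_size = true;  // bound per-thread tcache usage
  std::shared_ptr<MemoryAllocator> allocator;
  Status s = NewJemallocNodumpAllocator(jopts, &allocator);
  LRUCacheOptions cache_opts;
  cache_opts.capacity = capacity;
  if (s.ok()) {
    cache_opts.memory_allocator = allocator;
  }
  return NewLRUCache(cache_opts);
}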
uint64_t file_creation_time; // The checksum of a SST file, the value is decided by the file content and diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 1a621eec656..86343958abc 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -20,11 +20,14 @@ #include "rocksdb/advanced_options.h" #include "rocksdb/comparator.h" #include "rocksdb/compression_type.h" +#include "rocksdb/customizable.h" +#include "rocksdb/data_structure.h" #include "rocksdb/env.h" #include "rocksdb/file_checksum.h" #include "rocksdb/listener.h" #include "rocksdb/pre_release_callback.h" #include "rocksdb/sst_partitioner.h" +#include "rocksdb/types.h" #include "rocksdb/universal_compaction.h" #include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" @@ -58,6 +61,8 @@ class FileSystem; struct Options; struct DbPath; +using FileTypeSet = SmallEnumSet; + struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // The function recovers options to a previous version. Only 4.6 or later // versions are supported. @@ -125,9 +130,10 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Allows an application to modify/delete a key-value during background // compaction. // - // If the client requires a new compaction filter to be used for different - // compaction runs, it can specify compaction_filter_factory instead of this - // option. The client should specify only one of the two. + // If the client requires a new `CompactionFilter` to be used for different + // compaction runs and/or requires a `CompactionFilter` for table file + // creations outside of compaction, it can specify compaction_filter_factory + // instead of this option. The client should specify only one of the two. // compaction_filter takes precedence over compaction_filter_factory if // client specifies both. // @@ -138,12 +144,21 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Default: nullptr const CompactionFilter* compaction_filter = nullptr; - // This is a factory that provides compaction filter objects which allow - // an application to modify/delete a key-value during background compaction. + // This is a factory that provides `CompactionFilter` objects which allow + // an application to modify/delete a key-value during table file creation. + // + // Unlike the `compaction_filter` option, which is used when compaction + // creates a table file, this factory allows using a `CompactionFilter` when a + // table file is created for various reasons. The factory can decide what + // `TableFileCreationReason`s use a `CompactionFilter`. For compatibility, by + // default the decision is to use a `CompactionFilter` for + // `TableFileCreationReason::kCompaction` only. // - // A new filter will be created on each compaction run. If multithreaded - // compaction is being used, each created CompactionFilter will only be used - // from a single thread and so does not need to be thread-safe. + // Each thread of work involving creating table files will create a new + // `CompactionFilter` when it will be used according to the above + // `TableFileCreationReason`-based decision. This allows the application to + // know about the different ongoing threads of work and makes it unnecessary + // for `CompactionFilter` to provide thread-safety. 
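A sketch of a factory that opts into filtering during flush as well as compaction, matching the reason-based behavior described above. The ShouldFilterTableFileCreation override name is an assumption (the hook is not shown in this hunk), and MyCompactionFilter stands in for a user-defined CompactionFilter.

#include <memory>

#include "rocksdb/compaction_filter.h"
#include "rocksdb/types.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: filter table files created by flush as well as compaction.
class MyFilterFactory : public CompactionFilterFactory {
 public:
  // Assumed hook name for the reason-based decision described above.
  bool ShouldFilterTableFileCreation(
      TableFileCreationReason reason) const override {
    return reason == TableFileCreationReason::kCompaction ||
           reason == TableFileCreationReason::kFlush;
  }
  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
      const CompactionFilter::Context& /*context*/) override {
    // MyCompactionFilter is a hypothetical user-defined CompactionFilter.
    return std::unique_ptr<CompactionFilter>(new MyCompactionFilter());
  }
  const char* Name() const override { return "MyFilterFactory"; }
};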
// // Default: nullptr std::shared_ptr compaction_filter_factory = nullptr; @@ -349,6 +364,37 @@ struct DbPath { DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {} }; +extern const char* kHostnameForDbHostId; + +enum class CompactionServiceJobStatus : char { + kSuccess, + kFailure, + kUseLocal, // TODO: Add support for use local compaction +}; + +class CompactionService : public Customizable { + public: + static const char* Type() { return "CompactionService"; } + + // Returns the name of this compaction service. + virtual const char* Name() const = 0; + + // Start the compaction with input information, which can be passed to + // `DB::OpenAndCompact()`. + // job_id is pre-assigned, it will be reset after DB re-open. + // TODO: sub-compaction is not supported, as they will have the same job_id, a + // sub-compaction id might be added + virtual CompactionServiceJobStatus Start( + const std::string& compaction_service_input, int job_id) = 0; + + // Wait compaction to be finish. + // TODO: Add output path override + virtual CompactionServiceJobStatus WaitForComplete( + int job_id, std::string* compaction_service_result) = 0; + + virtual ~CompactionService() {} +}; + struct DBOptions { // The function recovers options to the option as in version 4.6. DBOptions* OldDefaults(int rocksdb_major_version = 4, @@ -391,6 +437,23 @@ struct DBOptions { // Default: true bool paranoid_checks = true; + // If true, during memtable flush, RocksDB will validate total entries + // read in flush, and compare with counter inserted into it. + // The option is here to turn the feature off in case this new validation + // feature has a bug. + // Default: true + bool flush_verify_memtable_count = true; + + // If true, the log numbers and sizes of the synced WALs are tracked + // in MANIFEST, then during DB recovery, if a synced WAL is missing + // from disk, or the WAL's size does not match the recorded size in + // MANIFEST, an error will be reported and the recovery will be aborted. + // + // Note that this option does not work with secondary instance. + // + // Default: false + bool track_and_verify_wals_in_manifest = false; + // Use the specified object to interact with the environment, // e.g. to read/write files, schedule background work, etc. In the near // future, support for doing storage operations such as read/write files @@ -821,7 +884,7 @@ struct DBOptions { // Allows OS to incrementally sync files to disk while they are being // written, asynchronously, in the background. This operation can be used // to smooth out write I/Os over time. Users shouldn't rely on it for - // persistency guarantee. + // persistence guarantee. // Issue one request for every bytes_per_sync written. 0 turns it off. // // You may consider using rate_limiter to regulate write rate to device. @@ -1179,6 +1242,38 @@ struct DBOptions { // // Default: false bool disable_manifest_sync = false; + + // A string identifying the machine hosting the DB. This + // will be written as a property in every SST file written by the DB (or + // by offline writers such as SstFileWriter and RepairDB). It can be useful + // for troubleshooting in memory corruption caused by a failing host when + // writing a file, by tracing back to the writing host. These corruptions + // may not be caught by the checksum since they happen before checksumming. + // If left as default, the table writer will substitute it with the actual + // hostname when writing the SST file. 
If set to an empty string, the + // property will not be written to the SST file. + // + // Default: hostname + std::string db_host_id = kHostnameForDbHostId; + + // Use this if your DB want to enable checksum handoff for specific file + // types writes. Make sure that the File_system you use support the + // crc32c checksum verification + // Currently supported file tyes: kWALFile, kTableFile, kDescriptorFile. + // NOTE: currently RocksDB only generates crc32c based checksum for the + // handoff. If the storage layer has different checksum support, user + // should enble this set as empty. Otherwise,it may cause unexpected + // write failures. + FileTypeSet checksum_handoff_file_types; + + // EXPERIMENTAL + // CompactionService is a feature allows the user to run compactions on a + // different host or process, which offloads the background load from the + // primary host. + // It's an experimental feature, the interface will be changed without + // backward/forward compatibility support for now. Some known issues are still + // under development. + std::shared_ptr compaction_service = nullptr; }; // Options to control the behavior of a database (passed to DB::Open) @@ -1253,19 +1348,28 @@ struct ReadOptions { // Default: nullptr const Slice* iterate_lower_bound; - // "iterate_upper_bound" defines the extent upto which the forward iterator + // "iterate_upper_bound" defines the extent up to which the forward iterator // can returns entries. Once the bound is reached, Valid() will be false. // "iterate_upper_bound" is exclusive ie the bound value is - // not a valid entry. If prefix_extractor is not null, the Seek target - // and iterate_upper_bound need to have the same prefix. - // This is because ordering is not guaranteed outside of prefix domain. + // not a valid entry. If prefix_extractor is not null: + // 1. If options.auto_prefix_mode = true, iterate_upper_bound will be used + // to infer whether prefix iterating (e.g. applying prefix bloom filter) + // can be used within RocksDB. This is done by comparing + // iterate_upper_bound with the seek key. + // 2. If options.auto_prefix_mode = false, iterate_upper_bound only takes + // effect if it shares the same prefix as the seek key. If + // iterate_upper_bound is outside the prefix of the seek key, then keys + // returned outside the prefix range will be undefined, just as if + // iterate_upper_bound = null. + // If iterate_upper_bound is not null, SeekToLast() will position the iterator + // at the first key smaller than iterate_upper_bound. // // Default: nullptr const Slice* iterate_upper_bound; // RocksDB does auto-readahead for iterators on noticing more than two reads // for a table file. The readahead starts at 8KB and doubles on every - // additional read upto 256KB. + // additional read up to 256KB. // This option can help if most of the range scans are large, and if it is // determined that a larger readahead than that enabled by auto-readahead is // needed. @@ -1323,7 +1427,7 @@ struct ReadOptions { // When true, by default use total_order_seek = true, and RocksDB can // selectively enable prefix seek mode if won't generate a different result // from total_order_seek, based on seek key, and iterator upper bound. - // Not suppported in ROCKSDB_LITE mode, in the way that even with value true + // Not supported in ROCKSDB_LITE mode, in the way that even with value true // prefix mode is not used. 
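A usage sketch of the clarified iterate_upper_bound semantics together with auto_prefix_mode: the bound is exclusive, must outlive the iterator, and with auto_prefix_mode=true it lets RocksDB decide internally whether prefix seek can be applied. The key range shown is illustrative.

#include <memory>

#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: bounded forward scan over ["user1000", "user2000").
void ScanRange(DB* db) {
  ReadOptions read_options;
  Slice upper_bound("user2000");
  read_options.iterate_upper_bound = &upper_bound;  // exclusive, must outlive `it`
  read_options.auto_prefix_mode = true;
  std::unique_ptr<Iterator> it(db->NewIterator(read_options));
  for (it->Seek("user1000"); it->Valid(); it->Next()) {
    // it->key() / it->value() stay within the bounded range.
  }
}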
// Default: false bool auto_prefix_mode; @@ -1399,7 +1503,7 @@ struct ReadOptions { // A timeout in microseconds to be passed to the underlying FileSystem for // reads. As opposed to deadline, this determines the timeout for each // individual file read request. If a MultiGet/Get/Seek/Next etc call - // results in multiple reads, each read can last upto io_timeout us. + // results in multiple reads, each read can last up to io_timeout us. std::chrono::microseconds io_timeout; // It limits the maximum cumulative value size of the keys in batch while @@ -1453,7 +1557,7 @@ struct WriteOptions { bool no_slowdown; // If true, this write request is of lower priority if compaction is - // behind. In this case, no_slowdown = true, the request will be cancelled + // behind. In this case, no_slowdown = true, the request will be canceled // immediately with Status::Incomplete() returned. Otherwise, it will be // slowed down. The slowdown value is determined by RocksDB to guarantee // it introduces minimum impacts to high priority writes. @@ -1572,6 +1676,12 @@ struct CompactRangeOptions { bool allow_write_stall = false; // If > 0, it will replace the option in the DBOptions for this compaction. uint32_t max_subcompactions = 0; + // Set user-defined timestamp low bound, the data with older timestamp than + // low bound maybe GCed by compaction. Default: nullptr + Slice* full_history_ts_low = nullptr; + + // Allows cancellation of an in-progress manual compaction. + std::atomic* canceled = nullptr; }; // IngestExternalFileOptions is used by IngestExternalFile() @@ -1591,7 +1701,7 @@ struct IngestExternalFileOptions { bool allow_blocking_flush = true; // Set to true if you would like duplicate keys in the file being ingested // to be skipped rather than overwriting existing data under that key. - // Usecase: back-fill of some historical data in the database without + // Use case: back-fill of some historical data in the database without // over-writing existing newer version of data. // This option could only be used if the DB has been running // with allow_ingest_behind=true since the dawn of time. @@ -1631,7 +1741,7 @@ struct IngestExternalFileOptions { // will be ignored; 2) If DB enable the checksum function, we calculate the // sst file checksum after the file is moved or copied and compare the // checksum and checksum name. If checksum or checksum function name does - // not match, ingestion will be failed. If the verification is sucessful, + // not match, ingestion will be failed. If the verification is successful, // checksum and checksum function name will be stored in Manifest. // If this option is set to FALSE, 1) if DB does not enable checksum, // the ingested checksum information will be ignored; 2) if DB enable the @@ -1696,4 +1806,20 @@ struct SizeApproximationOptions { double files_size_error_margin = -1.0; }; +struct CompactionServiceOptionsOverride { + // Currently pointer configurations are not passed to compaction service + // compaction so the user needs to set it. It will be removed once pointer + // configuration passing is supported. 
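A sketch of the new cancellation hook on CompactRangeOptions shown above: another thread sets the shared atomic, and the in-progress manual compaction is expected to stop with a non-OK status. Names and the exact returned status are illustrative.

#include <atomic>

#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch only: a manual compaction that can be aborted from another thread.
std::atomic<bool> manual_compaction_canceled{false};

Status RunFullCompaction(DB* db) {
  CompactRangeOptions cro;
  cro.canceled = &manual_compaction_canceled;  // polled while compaction runs
  // Expected to return a non-OK status if the flag is set mid-compaction.
  return db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr);
}

void CancelManualCompaction() { manual_compaction_canceled.store(true); }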
+ Env* env = Env::Default(); + std::shared_ptr file_checksum_gen_factory = nullptr; + + const Comparator* comparator = BytewiseComparator(); + std::shared_ptr merge_operator = nullptr; + const CompactionFilter* compaction_filter = nullptr; + std::shared_ptr compaction_filter_factory = nullptr; + std::shared_ptr prefix_extractor = nullptr; + std::shared_ptr table_factory; + std::shared_ptr sst_partitioner_factory = nullptr; +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 3d61000cc57..699f57344f1 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -57,7 +57,7 @@ struct PerfContext { // enable per level perf context and allocate storage for PerfContextByLevel void EnablePerLevelPerfContext(); - // temporarily disable per level perf contxt by setting the flag to false + // temporarily disable per level perf context by setting the flag to false void DisablePerLevelPerfContext(); // free the space for PerfContextByLevel, also disable per level perf context @@ -230,8 +230,15 @@ struct PerfContext { bool per_level_perf_context_enabled = false; }; -// Get Thread-local PerfContext object pointer -// if defined(NPERF_CONTEXT), then the pointer is not thread-local +// If RocksDB is compiled with -DNPERF_CONTEXT, then a pointer to a global, +// non-thread-local PerfContext object will be returned. Attempts to update +// this object will be ignored, and reading from it will also be no-op. +// Otherwise, +// a) if thread-local is supported on the platform, then a pointer to +// a thread-local PerfContext object will be returned. +// b) if thread-local is NOT supported, then compilation will fail. +// +// This function never returns nullptr. PerfContext* get_perf_context(); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/secondary_cache.h b/include/rocksdb/secondary_cache.h new file mode 100644 index 00000000000..221b3e5f231 --- /dev/null +++ b/include/rocksdb/secondary_cache.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include + +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/slice.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// A handle for lookup result. The handle may not be immediately ready or +// have a valid value. The caller must call isReady() to determine if its +// ready, and call Wait() in order to block until it becomes ready. +// The caller must call value() after it becomes ready to determine if the +// handle successfullly read the item. +class SecondaryCacheResultHandle { + public: + virtual ~SecondaryCacheResultHandle() {} + + // Returns whether the handle is ready or not + virtual bool IsReady() = 0; + + // Block until handle becomes ready + virtual void Wait() = 0; + + // Return the value. 
If nullptr, it means the lookup was unsuccessful + virtual void* Value() = 0; + + // Return the size of value + virtual size_t Size() = 0; +}; + +// SecondaryCache +// +// Cache interface for caching blocks on a secondary tier (which can include +// non-volatile media, or alternate forms of caching such as compressed data) +class SecondaryCache { + public: + virtual ~SecondaryCache() {} + + virtual std::string Name() = 0; + + static const std::string Type() { return "SecondaryCache"; } + + // Insert the given value into this cache. The value is not written + // directly. Rather, the SaveToCallback provided by helper_cb will be + // used to extract the persistable data in value, which will be written + // to this tier. The implementation may or may not write it to cache + // depending on the admission control policy, even if the return status is + // success. + virtual Status Insert(const Slice& key, void* value, + const Cache::CacheItemHelper* helper) = 0; + + // Lookup the data for the given key in this cache. The create_cb + // will be used to create the object. The handle returned may not be + // ready yet, unless wait=true, in which case Lookup() will block until + // the handle is ready + virtual std::unique_ptr Lookup( + const Slice& key, const Cache::CreateCallback& create_cb, bool wait) = 0; + + // At the discretion of the implementation, erase the data associated + // with key + virtual void Erase(const Slice& key) = 0; + + // Wait for a collection of handles to become ready + virtual void WaitAll(std::vector handles) = 0; + + virtual std::string GetPrintableOptions() const = 0; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index 54f61f9d2a4..589636744ee 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -62,7 +62,7 @@ class SliceTransform { virtual bool InRange(const Slice& /*dst*/) const { return false; } // Some SliceTransform will have a full length which can be used to - // determine if two keys are consecuitive. Can be disabled by always + // determine if two keys are consecutive. Can be disabled by always // returning 0 virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; } diff --git a/include/rocksdb/sst_file_manager.h b/include/rocksdb/sst_file_manager.h index 350dec7a8bb..5aae88dc1ed 100644 --- a/include/rocksdb/sst_file_manager.h +++ b/include/rocksdb/sst_file_manager.h @@ -19,17 +19,16 @@ namespace ROCKSDB_NAMESPACE { class Env; class Logger; -// SstFileManager is used to track SST files in the DB and control their -// deletion rate. -// All SstFileManager public functions are thread-safe. +// SstFileManager is used to track SST and blob files in the DB and control +// their deletion rate. All SstFileManager public functions are thread-safe. // SstFileManager is not extensible. class SstFileManager { public: virtual ~SstFileManager() {} // Update the maximum allowed space that should be used by RocksDB, if - // the total size of the SST files exceeds max_allowed_space, writes to - // RocksDB will fail. + // the total size of the SST and blob files exceeds max_allowed_space, writes + // to RocksDB will fail. // // Setting max_allowed_space to 0 will disable this feature; maximum allowed // space will be infinite (Default value). @@ -43,14 +42,14 @@ class SstFileManager { // other background functions may continue, such as logging and flushing. 
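To illustrate the SecondaryCache extension point declared above, here is a hedged, no-op skeleton. The method signatures follow the declarations in this header; the element type of the WaitAll() vector is assumed to be SecondaryCacheResultHandle* (the template argument is garbled in the diff text).

#include "rocksdb/secondary_cache.h"

class NoopSecondaryCache : public rocksdb::SecondaryCache {
 public:
  std::string Name() override { return "NoopSecondaryCache"; }

  // Admits nothing. A real implementation would use the SaveToCallback in
  // `helper` to serialize `value` into the secondary tier.
  rocksdb::Status Insert(const rocksdb::Slice& /*key*/, void* /*value*/,
                         const rocksdb::Cache::CacheItemHelper* /*helper*/) override {
    return rocksdb::Status::OK();
  }

  // Never hits. A real implementation would call `create_cb` on the stored
  // bytes and return a handle that becomes ready (immediately if wait=true).
  std::unique_ptr<rocksdb::SecondaryCacheResultHandle> Lookup(
      const rocksdb::Slice& /*key*/,
      const rocksdb::Cache::CreateCallback& /*create_cb*/,
      bool /*wait*/) override {
    return nullptr;
  }

  void Erase(const rocksdb::Slice& /*key*/) override {}

  void WaitAll(
      std::vector<rocksdb::SecondaryCacheResultHandle*> /*handles*/) override {}

  std::string GetPrintableOptions() const override { return ""; }
};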
virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0; - // Return true if the total size of SST files exceeded the maximum allowed - // space usage. + // Return true if the total size of SST and blob files exceeded the maximum + // allowed space usage. // // thread-safe. virtual bool IsMaxAllowedSpaceReached() = 0; - // Returns true if the total size of SST files as well as estimated size - // of ongoing compactions exceeds the maximums allowed space usage. + // Returns true if the total size of SST and blob files as well as estimated + // size of ongoing compactions exceeds the maximums allowed space usage. virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0; // Return the total size of all tracked files. @@ -87,7 +86,7 @@ class SstFileManager { }; // Create a new SstFileManager that can be shared among multiple RocksDB -// instances to track SST file and control there deletion rate. +// instances to track SST and blob files and control there deletion rate. // Even though SstFileManager don't track WAL files but it still control // there deletion rate. // diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h index 88b1e42cec1..ec436c32edf 100644 --- a/include/rocksdb/sst_file_writer.h +++ b/include/rocksdb/sst_file_writer.h @@ -90,6 +90,9 @@ class SstFileWriter { // ascending order. // If unsafe_disable_sync is false, SstFileWriter will not sync new sst file // on close. + // The `skip_filters` option is DEPRECATED and could be removed in the + // future. Use `BlockBasedTableOptions::filter_policy` to control filter + // generation. SstFileWriter(const EnvOptions& env_options, const Options& options, ColumnFamilyHandle* column_family = nullptr, bool invalidate_page_cache = true, diff --git a/include/rocksdb/sst_partitioner.h b/include/rocksdb/sst_partitioner.h index 5d181958f55..1ac16b49e6e 100644 --- a/include/rocksdb/sst_partitioner.h +++ b/include/rocksdb/sst_partitioner.h @@ -51,12 +51,12 @@ class SstPartitioner { // It is called for all keys in compaction. When partitioner want to create // new SST file it needs to return true. It means compaction job will finish // current SST file where last key is "prev_user_key" parameter and start new - // SST file where first key is "current_user_key". Returns decission if + // SST file where first key is "current_user_key". Returns decision if // partition boundary was detected and compaction should create new file. virtual PartitionerResult ShouldPartition( const PartitionerRequest& request) = 0; - // Called with smallest and largest keys in SST file when compation try to do + // Called with smallest and largest keys in SST file when compaction try to do // trivial move. Returns true is partitioner allows to do trivial move. virtual bool CanDoTrivialMove(const Slice& smallest_user_key, const Slice& largest_user_key) = 0; diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 98b4fb970de..8fc5a2eedb2 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -117,7 +117,7 @@ enum Tickers : uint32_t { COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted. // Deletions obsoleted before bottom level due to file gap optimization. 
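Since the comments above now cover blob files as well as SSTs, a short, hedged sketch of wiring an SstFileManager into the DB options and capping total tracked file size; the limit value is an arbitrary example.

#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

rocksdb::Options MakeOptionsWithSpaceLimit() {
  rocksdb::Options options;
  std::shared_ptr<rocksdb::SstFileManager> sfm(
      rocksdb::NewSstFileManager(rocksdb::Env::Default()));
  // Fail writes once tracked SST + blob files exceed ~64 GiB.
  sfm->SetMaxAllowedSpaceUsage(64ull << 30);
  options.sst_file_manager = sfm;
  return options;
}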
COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, - // If a compaction was cancelled in sfm to prevent ENOSPC + // If a compaction was canceled in sfm to prevent ENOSPC COMPACTION_CANCELLED, // Number of keys written to the database via the Put and Write call's @@ -183,7 +183,7 @@ enum Tickers : uint32_t { // over large number of keys with same userkey. NUMBER_OF_RESEEKS_IN_ITERATION, - // Record the number of calls to GetUpadtesSince. Useful to keep track of + // Record the number of calls to GetUpdatesSince. Useful to keep track of // transaction log iterator refreshes GET_UPDATES_SINCE_CALLS, BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache @@ -374,6 +374,21 @@ enum Tickers : uint32_t { // # of files deleted immediately by sst file manger through delete scheduler. FILES_DELETED_IMMEDIATELY, + // The counters for error handler, not that, bg_io_error is the subset of + // bg_error and bg_retryable_io_error is the subset of bg_io_error + ERROR_HANDLER_BG_ERROR_COUNT, + ERROR_HANDLER_BG_IO_ERROR_COUNT, + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + ERROR_HANDLER_AUTORESUME_COUNT, + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, + + // Statistics for memtable garbage collection: + // Raw bytes of data (payload) present on memtable at flush time. + MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + // Outdated bytes of data present on memtable at flush time. + MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + TICKER_ENUM_MAX }; @@ -438,7 +453,7 @@ enum Histograms : uint32_t { BLOB_DB_VALUE_SIZE, // BlobDB Put/PutWithTTL/PutUntil/Write latency. BLOB_DB_WRITE_MICROS, - // BlobDB Get lagency. + // BlobDB Get latency. BLOB_DB_GET_MICROS, // BlobDB MultiGet latency. BLOB_DB_MULTIGET_MICROS, @@ -472,6 +487,9 @@ enum Histograms : uint32_t { // Num of sst files read from file system per level. NUM_SST_READ_PER_LEVEL, + // Error handler statistics + ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + HISTOGRAM_ENUM_MAX, }; diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index bcc55e4fd02..1de2ebcb046 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -65,9 +65,11 @@ class Status { // In case of intentionally swallowing an error, user must explicitly call // this function. That way we are easily able to search the code to find where // error swallowing occurs. 
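A small, hedged sketch of reading the new error-handler and memtable-garbage tickers added above; it assumes options.statistics was populated (e.g. via rocksdb::CreateDBStatistics()) before the DB was opened.

#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

uint64_t FlushedGarbageBytes(const rocksdb::Options& options) {
  if (!options.statistics) return 0;
  // Other new counters such as ERROR_HANDLER_BG_ERROR_COUNT and
  // MEMTABLE_PAYLOAD_BYTES_AT_FLUSH can be read the same way.
  return options.statistics->getTickerCount(
      rocksdb::MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
}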
- void PermitUncheckedError() const { + inline void PermitUncheckedError() const { MarkChecked(); } + + inline void MustCheck() const { #ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; + checked_ = false; #endif // ROCKSDB_ASSERT_STATUS_CHECKED } @@ -92,9 +94,7 @@ class Status { }; Code code() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code_; } @@ -118,9 +118,7 @@ class Status { }; SubCode subcode() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return subcode_; } @@ -134,18 +132,18 @@ class Status { }; Status(const Status& s, Severity sev); + + Status(Code _code, SubCode _subcode, Severity _sev, const Slice& msg) + : Status(_code, _subcode, msg, "", _sev) {} + Severity severity() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return sev_; } // Returns a C style string indicating the message of the Status const char* getState() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return state_; } @@ -289,127 +287,95 @@ class Status { // Returns true iff the status indicates success. bool ok() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kOk; } // Returns true iff the status indicates success *with* something // overwritten bool IsOkOverwritten() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kOk && subcode() == kOverwritten; } // Returns true iff the status indicates a NotFound error. bool IsNotFound() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kNotFound; } // Returns true iff the status indicates a Corruption error. bool IsCorruption() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kCorruption; } // Returns true iff the status indicates a NotSupported error. bool IsNotSupported() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kNotSupported; } // Returns true iff the status indicates an InvalidArgument error. bool IsInvalidArgument() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kInvalidArgument; } // Returns true iff the status indicates an IOError. bool IsIOError() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kIOError; } // Returns true iff the status indicates an MergeInProgress. 
bool IsMergeInProgress() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kMergeInProgress; } // Returns true iff the status indicates Incomplete bool IsIncomplete() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kIncomplete; } // Returns true iff the status indicates Shutdown In progress bool IsShutdownInProgress() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kShutdownInProgress; } bool IsTimedOut() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kTimedOut; } bool IsAborted() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kAborted; } bool IsLockLimit() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kAborted && subcode() == kLockLimit; } // Returns true iff the status indicates that a resource is Busy and // temporarily could not be acquired. bool IsBusy() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kBusy; } bool IsDeadlock() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kBusy && subcode() == kDeadlock; } // Returns true iff the status indicated that the operation has Expired. bool IsExpired() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kExpired; } @@ -417,25 +383,19 @@ class Status { // This usually means that the operation failed, but may succeed if // re-attempted. bool IsTryAgain() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kTryAgain; } // Returns true iff the status indicates the proposed compaction is too large bool IsCompactionTooLarge() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kCompactionTooLarge; } // Returns true iff the status indicates Column Family Dropped bool IsColumnFamilyDropped() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return code() == kColumnFamilyDropped; } @@ -445,9 +405,7 @@ class Status { // with a specific subcode, enabling users to take the appropriate action // if needed bool IsNoSpace() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return (code() == kIOError) && (subcode() == kNoSpace); } @@ -455,9 +413,7 @@ class Status { // cases where we limit the memory used in certain operations (eg. the size // of a write batch) in order to avoid out of memory exceptions. bool IsMemoryLimit() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return (code() == kAborted) && (subcode() == kMemoryLimit); } @@ -466,9 +422,7 @@ class Status { // directory" error condition. 
A PathNotFound error is an I/O error with // a specific subcode, enabling users to take appropriate action if necessary bool IsPathNotFound() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return (code() == kIOError || code() == kNotFound) && (subcode() == kPathNotFound); } @@ -476,25 +430,19 @@ class Status { // Returns true iff the status indicates manual compaction paused. This // is caused by a call to PauseManualCompaction bool IsManualCompactionPaused() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return (code() == kIncomplete) && (subcode() == kManualCompactionPaused); } // Returns true iff the status indicates a TxnNotPrepared error. bool IsTxnNotPrepared() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return (code() == kInvalidArgument) && (subcode() == kTxnNotPrepared); } // Returns true iff the status indicates a IOFenced error. bool IsIOFenced() const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); return (code() == kIOError) && (subcode() == kIOFenced); } @@ -519,33 +467,34 @@ class Status { explicit Status(Code _code, SubCode _subcode = kNone) : code_(_code), subcode_(_subcode), sev_(kNoError), state_(nullptr) {} - Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2); + Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2, + Severity sev = kNoError); Status(Code _code, const Slice& msg, const Slice& msg2) : Status(_code, kNone, msg, msg2) {} static const char* CopyState(const char* s); + + inline void MarkChecked() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + } }; inline Status::Status(const Status& s) : code_(s.code_), subcode_(s.subcode_), sev_(s.sev_) { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - s.checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + s.MarkChecked(); state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); } inline Status::Status(const Status& s, Severity sev) : code_(s.code_), subcode_(s.subcode_), sev_(sev) { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - s.checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + s.MarkChecked(); state_ = (s.state_ == nullptr) ? 
nullptr : CopyState(s.state_); } inline Status& Status::operator=(const Status& s) { if (this != &s) { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - s.checked_ = true; - checked_ = false; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + s.MarkChecked(); + MustCheck(); code_ = s.code_; subcode_ = s.subcode_; sev_ = s.sev_; @@ -560,9 +509,7 @@ inline Status::Status(Status&& s) noexcept #endif : Status() { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - s.checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + s.MarkChecked(); *this = std::move(s); } @@ -572,10 +519,8 @@ inline Status& Status::operator=(Status&& s) #endif { if (this != &s) { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - s.checked_ = true; - checked_ = false; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + s.MarkChecked(); + MustCheck(); code_ = std::move(s.code_); s.code_ = kOk; subcode_ = std::move(s.subcode_); @@ -590,18 +535,14 @@ inline Status& Status::operator=(Status&& s) } inline bool Status::operator==(const Status& rhs) const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; - rhs.checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); + rhs.MarkChecked(); return (code_ == rhs.code_); } inline bool Status::operator!=(const Status& rhs) const { -#ifdef ROCKSDB_ASSERT_STATUS_CHECKED - checked_ = true; - rhs.checked_ = true; -#endif // ROCKSDB_ASSERT_STATUS_CHECKED + MarkChecked(); + rhs.MarkChecked(); return !(*this == rhs); } diff --git a/include/rocksdb/system_clock.h b/include/rocksdb/system_clock.h new file mode 100644 index 00000000000..e03d195ee24 --- /dev/null +++ b/include/rocksdb/system_clock.h @@ -0,0 +1,102 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include +#include + +#include + +#ifdef _WIN32 +// Windows API macro interference +#undef GetCurrentTime +#endif + +namespace ROCKSDB_NAMESPACE { +struct ConfigOptions; + +// A SystemClock is an interface used by the rocksdb implementation to access +// operating system time-related functionality. +class SystemClock { + public: + virtual ~SystemClock() {} + + static const char* Type() { return "SystemClock"; } + + // The name of this system clock + virtual const char* Name() const = 0; + + // Return a default SystemClock suitable for the current operating + // system. + static const std::shared_ptr& Default(); + + // Returns the number of micro-seconds since some fixed point in time. + // It is often used as system time such as in GenericRateLimiter + // and other places so a port needs to return system time in order to work. + virtual uint64_t NowMicros() = 0; + + // Returns the number of nano-seconds since some fixed point in time. Only + // useful for computing deltas of time in one run. + // Default implementation simply relies on NowMicros. + // In platform-specific implementations, NowNanos() should return time points + // that are MONOTONIC. + virtual uint64_t NowNanos() { return NowMicros() * 1000; } + + // Returns the number of micro-seconds of CPU time used by the current thread. + // 0 indicates not supported. 
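With ROCKSDB_ASSERT_STATUS_CHECKED builds in mind, a hedged sketch of how the checked-status plumbing above is intended to be used by callers:

#include "rocksdb/db.h"
#include "rocksdb/status.h"

void WriteAndFlush(rocksdb::DB* db) {
  rocksdb::Status s = db->Put(rocksdb::WriteOptions(), "key", "value");
  if (!s.ok()) {
    // Calling ok()/code()/IsIOError() etc. marks the Status as checked, so
    // ROCKSDB_ASSERT_STATUS_CHECKED builds will not assert on destruction.
  }
  // When a result is intentionally ignored, say so explicitly instead of
  // silently dropping the Status.
  db->Flush(rocksdb::FlushOptions()).PermitUncheckedError();
}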
+ virtual uint64_t CPUMicros() { return 0; } + + // Returns the number of nano-seconds of CPU time used by the current thread. + // Default implementation simply relies on CPUMicros. + // 0 indicates not supported. + virtual uint64_t CPUNanos() { return CPUMicros() * 1000; } + + // Sleep/delay the thread for the prescribed number of micro-seconds. + virtual void SleepForMicroseconds(int micros) = 0; + + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). + // Only overwrites *unix_time on success. + virtual Status GetCurrentTime(int64_t* unix_time) = 0; + + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time) = 0; +}; + +// Wrapper class for a SystemClock. Redirects all methods (except Name) +// of the SystemClock interface to the target/wrapped class. +class SystemClockWrapper : public SystemClock { + public: + explicit SystemClockWrapper(const std::shared_ptr& t) + : target_(t) {} + + uint64_t NowMicros() override { return target_->NowMicros(); } + + uint64_t NowNanos() override { return target_->NowNanos(); } + + uint64_t CPUMicros() override { return target_->CPUMicros(); } + + uint64_t CPUNanos() override { return target_->CPUNanos(); } + + virtual void SleepForMicroseconds(int micros) override { + return target_->SleepForMicroseconds(micros); + } + + Status GetCurrentTime(int64_t* unix_time) override { + return target_->GetCurrentTime(unix_time); + } + + std::string TimeToString(uint64_t time) override { + return target_->TimeToString(time); + } + + protected: + std::shared_ptr target_; +}; + +} // end namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 8e0d144f886..13b31ee47b9 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -22,7 +22,7 @@ #include #include -#include "rocksdb/configurable.h" +#include "rocksdb/customizable.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/status.h" @@ -51,6 +51,55 @@ enum ChecksumType : char { kxxHash64 = 0x3, }; +// `PinningTier` is used to specify which tier of block-based tables should +// be affected by a block cache pinning setting (see +// `MetadataCacheOptions` below). +enum class PinningTier { + // For compatibility, this value specifies to fallback to the behavior + // indicated by the deprecated options, + // `pin_l0_filter_and_index_blocks_in_cache` and + // `pin_top_level_index_and_filter`. + kFallback, + + // This tier contains no block-based tables. + kNone, + + // This tier contains block-based tables that may have originated from a + // memtable flush. In particular, it includes tables from L0 that are smaller + // than 1.5 times the current `write_buffer_size`. Note these criteria imply + // it can include intra-L0 compaction outputs and ingested files, as long as + // they are not abnormally large compared to flushed files in L0. + kFlushedAndSimilar, + + // This tier contains all block-based tables. + kAll, +}; + +// `MetadataCacheOptions` contains members indicating the desired caching +// behavior for the different categories of metadata blocks. +struct MetadataCacheOptions { + // The tier of block-based tables whose top-level index into metadata + // partitions will be pinned. Currently indexes and filters may be + // partitioned. + // + // Note `cache_index_and_filter_blocks` must be true for this option to have + // any effect. Otherwise any top-level index into metadata partitions would be + // held in table reader memory, outside the block cache. 
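As an illustration of the new SystemClock/SystemClockWrapper interfaces above, a hedged sketch of a custom wrapper; the class name and offset behavior are invented for the example, and only NowMicros() is shifted here.

#include <memory>
#include "rocksdb/system_clock.h"

// SystemClockWrapper forwards every method except Name(), so a subclass only
// needs to supply a name plus whatever it wants to override.
class OffsetClock : public rocksdb::SystemClockWrapper {
 public:
  OffsetClock(const std::shared_ptr<rocksdb::SystemClock>& base,
              uint64_t offset_micros)
      : SystemClockWrapper(base), offset_micros_(offset_micros) {}

  const char* Name() const override { return "OffsetClock"; }

  uint64_t NowMicros() override {
    return SystemClockWrapper::NowMicros() + offset_micros_;
  }

 private:
  uint64_t offset_micros_;
};

// Usage (illustrative): auto clock = std::make_shared<OffsetClock>(
//     rocksdb::SystemClock::Default(), /*offset_micros=*/0);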
+ PinningTier top_level_index_pinning = PinningTier::kFallback; + + // The tier of block-based tables whose metadata partitions will be pinned. + // Currently indexes and filters may be partitioned. + PinningTier partition_pinning = PinningTier::kFallback; + + // The tier of block-based tables whose unpartitioned metadata blocks will be + // pinned. + // + // Note `cache_index_and_filter_blocks` must be true for this option to have + // any effect. Otherwise the unpartitioned meta-blocks would be held in table + // reader memory, outside the block cache. + PinningTier unpartitioned_pinning = PinningTier::kFallback; +}; + // For advanced user only struct BlockBasedTableOptions { static const char* kName() { return "BlockTableOptions"; }; @@ -79,12 +128,44 @@ struct BlockBasedTableOptions { // than data blocks. bool cache_index_and_filter_blocks_with_high_priority = true; + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating each of the following variables that + // has the default value, `PinningTier::kFallback`: + // + // - `MetadataCacheOptions::partition_pinning` + // - `MetadataCacheOptions::unpartitioned_pinning` + // + // The updated value is chosen as follows: + // + // - `pin_l0_filter_and_index_blocks_in_cache == false` -> + // `PinningTier::kNone` + // - `pin_l0_filter_and_index_blocks_in_cache == true` -> + // `PinningTier::kFlushedAndSimilar` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. + // // if cache_index_and_filter_blocks is true and the below is true, then // filter and index blocks are stored in the cache, but a reference is // held in the "table reader" object so the blocks are pinned and only // evicted from cache when the table reader is freed. bool pin_l0_filter_and_index_blocks_in_cache = false; + // DEPRECATED: This option will be removed in a future version. For now, this + // option still takes effect by updating + // `MetadataCacheOptions::top_level_index_pinning` when it has the + // default value, `PinningTier::kFallback`. + // + // The updated value is chosen as follows: + // + // - `pin_top_level_index_and_filter == false` -> + // `PinningTier::kNone` + // - `pin_top_level_index_and_filter == true` -> + // `PinningTier::kAll` + // + // To migrate away from this flag, explicitly configure + // `MetadataCacheOptions` as described above. + // // If cache_index_and_filter_blocks is true and the below is true, then // the top-level index of partitioned filter and index blocks are stored in // the cache, but a reference is held in the "table reader" object so the @@ -92,6 +173,12 @@ struct BlockBasedTableOptions { // freed. This is not limited to l0 in LSM tree. bool pin_top_level_index_and_filter = true; + // The desired block cache pinning behavior for the different categories of + // metadata blocks. While pinning can reduce block cache contention, users + // must take care not to pin excessive amounts of data, which risks + // overflowing block cache. + MetadataCacheOptions metadata_cache_options; + // The index type that will be used for this table. enum IndexType : char { // A space efficient index block that is optimized for @@ -201,13 +288,13 @@ struct BlockBasedTableOptions { // incompatible with block-based filters. bool partition_filters = false; - // EXPERIMENTAL Option to generate Bloom filters that minimize memory + // Option to generate Bloom/Ribbon filters that minimize memory // internal fragmentation. 
// // When false, malloc_usable_size is not available, or format_version < 5, // filters are generated without regard to internal fragmentation when // loaded into memory (historical behavior). When true (and - // malloc_usable_size is available and format_version >= 5), then Bloom + // malloc_usable_size is available and format_version >= 5), then // filters are generated to "round up" and "round down" their sizes to // minimize internal fragmentation when loaded into memory, assuming the // reading DB has the same memory allocation characteristics as the @@ -226,7 +313,8 @@ struct BlockBasedTableOptions { // NOTE: Because some memory counted by block cache might be unmapped pages // within internal fragmentation, this option can increase observed RSS // memory usage. With cache_index_and_filter_blocks=true, this option makes - // the block cache better at using space it is allowed. + // the block cache better at using space it is allowed. (These issues + // should not arise with partitioned filters.) // // NOTE: Do not set to true if you do not trust malloc_usable_size. With // this option, RocksDB might access an allocated memory object beyond its @@ -304,7 +392,7 @@ struct BlockBasedTableOptions { // 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned // filters use a generally faster and more accurate Bloom filter // implementation, with a different schema. - uint32_t format_version = 4; + uint32_t format_version = 5; // Store index blocks on disk in compressed format. Changing this option to // false will avoid the overhead of decompression if index blocks are evicted @@ -348,6 +436,55 @@ struct BlockBasedTableOptions { IndexShorteningMode index_shortening = IndexShorteningMode::kShortenSeparators; + + // RocksDB does auto-readahead for iterators on noticing more than two reads + // for a table file if user doesn't provide readahead_size. The readahead + // starts at 8KB and doubles on every additional read upto + // max_auto_readahead_size and max_auto_readahead_size can be configured. + // + // Special Value: 0 - If max_auto_readahead_size is set 0 then no implicit + // auto prefetching will be done. If max_auto_readahead_size provided is less + // than 8KB (which is initial readahead size used by rocksdb in case of + // auto-readahead), readahead size will remain same as + // max_auto_readahead_size. + // + // Value should be provided along with KB i.e. 256 * 1024 as it will prefetch + // the blocks. + // + // Found that 256 KB readahead size provides the best performance, based on + // experiments, for auto readahead. Experiment data is in PR #3282. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{max_auto_readahead_size=0;}"}})); + // + // Changing the value dynamically will only affect files opened after the + // change. + // + // Default: 256 KB (256 * 1024). + size_t max_auto_readahead_size = 256 * 1024; + + // If enabled, prepopulate warm/hot data blocks which are already in memory + // into block cache at the time of flush. On a flush, the data block that is + // in memory (in memtables) get flushed to the device. If using Direct IO, + // additional IO is incurred to read this data back into memory again, which + // is avoided by enabling this option. This further helps if the workload + // exhibits high temporal locality, where most of the reads go to recently + // written data. This also helps in case of Distributed FileSystem. 
+ // + // Right now, this is enabled only for flush for data blocks. We plan to + // expand this option to cover compactions in the future and for other types + // of blocks. + enum class PrepopulateBlockCache : char { + // Disable prepopulate block cache. + kDisable, + // Prepopulate data blocks during flush only. Plan to extend it to all block + // types. + kFlushOnly, + }; + + PrepopulateBlockCache prepopulate_block_cache = + PrepopulateBlockCache::kDisable; }; // Table Properties that are specific to block-based table properties. @@ -441,7 +578,7 @@ struct PlainTableOptions { // @store_index_in_file: compute plain table index and bloom filter during // file building and store it in file. When reading - // file, index will be mmaped instead of recomputation. + // file, index will be mapped instead of recomputation. bool store_index_in_file = false; }; @@ -526,7 +663,7 @@ extern TableFactory* NewCuckooTableFactory( class RandomAccessFileReader; // A base class for table factories. -class TableFactory : public Configurable { +class TableFactory : public Customizable { public: virtual ~TableFactory() override {} @@ -540,21 +677,7 @@ class TableFactory : public Configurable { const std::string& id, std::shared_ptr* factory); - // The type of the table. - // - // The client of this package should switch to a new name whenever - // the table format implementation changes. - // - // Names starting with "rocksdb." are reserved and should not be used - // by any clients of this package. - virtual const char* Name() const = 0; - - // Returns true if the class is an instance of the input name. - // This is typically determined by if the input name matches the - // name of this object. - virtual bool IsInstanceOf(const std::string& name) const { - return name == Name(); - } + static const char* Type() { return "TableFactory"; } // Returns a Table object table that can fetch data from file specified // in parameter file. It's the caller's responsibility to make sure @@ -611,7 +734,7 @@ class TableFactory : public Configurable { // to use in this table. 
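Pulling the new block-based table knobs above together, a hedged configuration sketch; the chosen values are arbitrary examples, not recommendations.

#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakeTableOptions() {
  rocksdb::BlockBasedTableOptions bbto;
  // Pinning tiers only take effect with cache_index_and_filter_blocks=true.
  bbto.cache_index_and_filter_blocks = true;
  bbto.metadata_cache_options.top_level_index_pinning =
      rocksdb::PinningTier::kAll;
  bbto.metadata_cache_options.partition_pinning =
      rocksdb::PinningTier::kFlushedAndSimilar;
  bbto.metadata_cache_options.unpartitioned_pinning =
      rocksdb::PinningTier::kFlushedAndSimilar;
  // Cap implicit iterator readahead at 256 KB (the default); 0 disables it.
  bbto.max_auto_readahead_size = 256 * 1024;
  // Warm the block cache with data blocks at flush time.
  bbto.prepopulate_block_cache =
      rocksdb::BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;

  rocksdb::Options options;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
  return options;
}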
virtual TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const = 0; + WritableFileWriter* file) const = 0; // Return is delete range supported virtual bool IsDeleteRangeSupported() const { return false; } diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index ba3eca752ef..d3e9eeace3e 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -32,6 +32,7 @@ typedef std::map UserCollectedProperties; struct TablePropertiesNames { static const std::string kDbId; static const std::string kDbSessionId; + static const std::string kDbHostId; static const std::string kDataSize; static const std::string kIndexSize; static const std::string kIndexPartitions; @@ -43,6 +44,7 @@ struct TablePropertiesNames { static const std::string kRawValueSize; static const std::string kNumDataBlocks; static const std::string kNumEntries; + static const std::string kNumFilterEntries; static const std::string kDeletedKeys; static const std::string kMergeOperands; static const std::string kNumRangeDeletions; @@ -60,6 +62,8 @@ struct TablePropertiesNames { static const std::string kCreationTime; static const std::string kOldestKeyTime; static const std::string kFileCreationTime; + static const std::string kSlowCompressionEstimatedDataSize; + static const std::string kFastCompressionEstimatedDataSize; }; extern const std::string kPropertiesBlock; @@ -98,9 +102,9 @@ class TablePropertiesCollector { } // Called after each new block is cut - virtual void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) { + virtual void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) { // Nothing to do here. Callback registers can override. return; } @@ -172,6 +176,8 @@ struct TableProperties { uint64_t num_data_blocks = 0; // the number of entries in this table uint64_t num_entries = 0; + // the number of unique entries (keys or prefixes) added to filters + uint64_t num_filter_entries = 0; // the number of deletions in the table uint64_t num_deletions = 0; // the number of merge operands in the table @@ -194,6 +200,14 @@ struct TableProperties { uint64_t oldest_key_time = 0; // Actual SST file creation time. 0 means unknown. uint64_t file_creation_time = 0; + // Estimated size of data blocks if compressed using a relatively slower + // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`). + // 0 means unknown. + uint64_t slow_compression_estimated_data_size = 0; + // Estimated size of data blocks if compressed using a relatively faster + // compression algorithm (see `ColumnFamilyOptions::sample_for_compression`). + // 0 means unknown. + uint64_t fast_compression_estimated_data_size = 0; // DB identity // db_id is an identifier generated the first time the DB is created @@ -206,6 +220,12 @@ struct TableProperties { // empty string. std::string db_session_id; + // Location of the machine hosting the DB instance + // db_host_id identifies the location of the host in some form + // (hostname by default, but can also be any string of the user's choosing). + // It can potentially change whenever the DB is opened + std::string db_host_id; + // Name of the column family with which this SST file is associated. // If column family is unknown, `column_family_name` will be an empty string. 
std::string column_family_name; @@ -251,6 +271,11 @@ struct TableProperties { // Aggregate the numerical member variables of the specified // TableProperties. void Add(const TableProperties& tp); + + // Subset of properties that make sense when added together + // between tables. Keys match field names in this class instead + // of using full property names. + std::map GetAggregatablePropertiesAsMap() const; }; // Extra properties diff --git a/include/rocksdb/trace_reader_writer.h b/include/rocksdb/trace_reader_writer.h index d58ed47b2a0..26ceab2c847 100644 --- a/include/rocksdb/trace_reader_writer.h +++ b/include/rocksdb/trace_reader_writer.h @@ -28,7 +28,7 @@ class TraceWriter { }; // TraceReader allows reading RocksDB traces from any system, one operation at -// a time. A RocksDB Replayer could depend on this to replay opertions. +// a time. A RocksDB Replayer could depend on this to replay operations. class TraceReader { public: TraceReader() {} diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h index a4ab9c07a0b..f495fcd22c1 100644 --- a/include/rocksdb/types.h +++ b/include/rocksdb/types.h @@ -12,11 +12,36 @@ namespace ROCKSDB_NAMESPACE { // Define all public custom types here. +using ColumnFamilyId = uint32_t; + // Represents a sequence number in a WAL file. typedef uint64_t SequenceNumber; const SequenceNumber kMinUnCommittedSeq = 1; // 0 is always committed +enum class TableFileCreationReason { + kFlush, + kCompaction, + kRecovery, + kMisc, +}; + +// The types of files RocksDB uses in a DB directory. (Available for +// advanced options.) +enum FileType { + kWalFile, + kDBLockFile, + kTableFile, + kDescriptorFile, + kCurrentFile, + kTempFile, + kInfoLogFile, // Either the current one, or an old one + kMetaDatabase, + kIdentityFile, + kOptionsFile, + kBlobFile +}; + // User-oriented representation of internal key types. // Ordering of this enum entries should not change. enum EntryType { diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h index e3aeee6ceec..f4df5c0009c 100644 --- a/include/rocksdb/universal_compaction.h +++ b/include/rocksdb/universal_compaction.h @@ -36,12 +36,12 @@ class CompactionOptionsUniversal { // The size amplification is defined as the amount (in percentage) of // additional storage needed to store a single byte of data in the database. // For example, a size amplification of 2% means that a database that - // contains 100 bytes of user-data may occupy upto 102 bytes of + // contains 100 bytes of user-data may occupy up to 102 bytes of // physical storage. By this definition, a fully compacted database has // a size amplification of 0%. Rocksdb uses the following heuristic // to calculate size amplification: it assumes that all files excluding // the earliest file contribute to the size amplification. - // Default: 200, which means that a 100 byte database could require upto + // Default: 200, which means that a 100 byte database could require up to // 300 bytes of storage. unsigned int max_size_amplification_percent; diff --git a/include/rocksdb/utilities/backup_engine.h b/include/rocksdb/utilities/backup_engine.h new file mode 100644 index 00000000000..d6a7764e635 --- /dev/null +++ b/include/rocksdb/utilities/backup_engine.h @@ -0,0 +1,606 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +// The default DB file checksum function name. +constexpr char kDbFileChecksumFuncName[] = "FileChecksumCrc32c"; +// The default BackupEngine file checksum function name. +constexpr char kBackupFileChecksumFuncName[] = "crc32c"; + +struct BackupEngineOptions { + // Where to keep the backup files. Has to be different than dbname_ + // Best to set this to dbname_ + "/backups" + // Required + std::string backup_dir; + + // Backup Env object. It will be used for backup file I/O. If it's + // nullptr, backups will be written out using DBs Env. If it's + // non-nullptr, backup's I/O will be performed using this object. + // If you want to have backups on HDFS, use HDFS Env here! + // Default: nullptr + Env* backup_env; + + // share_table_files supports table and blob files. + // + // If share_table_files == true, the backup directory will share table and + // blob files among backups, to save space among backups of the same DB and to + // enable incremental backups by only copying new files. + // If share_table_files == false, each backup will be on its own and will not + // share any data with other backups. + // + // default: true + bool share_table_files; + + // Backup info and error messages will be written to info_log + // if non-nullptr. + // Default: nullptr + Logger* info_log; + + // If sync == true, we can guarantee you'll get consistent backup even + // on a machine crash/reboot. Backup process is slower with sync enabled. + // If sync == false, we don't guarantee anything on machine reboot. However, + // chances are some of the backups are consistent. + // Default: true + bool sync; + + // If true, it will delete whatever backups there are already + // Default: false + bool destroy_old_data; + + // If false, we won't backup log files. This option can be useful for backing + // up in-memory databases where log file are persisted, but table files are in + // memory. + // Default: true + bool backup_log_files; + + // Max bytes that can be transferred in a second during backup. + // If 0, go as fast as you can + // Default: 0 + uint64_t backup_rate_limit; + + // Backup rate limiter. Used to control transfer speed for backup. If this is + // not null, backup_rate_limit is ignored. + // Default: nullptr + std::shared_ptr backup_rate_limiter{nullptr}; + + // Max bytes that can be transferred in a second during restore. + // If 0, go as fast as you can + // Default: 0 + uint64_t restore_rate_limit; + + // Restore rate limiter. Used to control transfer speed during restore. If + // this is not null, restore_rate_limit is ignored. + // Default: nullptr + std::shared_ptr restore_rate_limiter{nullptr}; + + // share_files_with_checksum supports table and blob files. + // + // Only used if share_table_files is set to true. Setting to false is + // DEPRECATED and potentially dangerous because in that case BackupEngine + // can lose data if backing up databases with distinct or divergent + // history, for example if restoring from a backup other than the latest, + // writing to the DB, and creating another backup. 
Setting to true (default) + // prevents these issues by ensuring that different table files (SSTs) and + // blob files with the same number are treated as distinct. See + // share_files_with_checksum_naming and ShareFilesNaming. + // + // Default: true + bool share_files_with_checksum; + + // Up to this many background threads will copy files for CreateNewBackup() + // and RestoreDBFromBackup() + // Default: 1 + int max_background_operations; + + // During backup user can get callback every time next + // callback_trigger_interval_size bytes being copied. + // Default: 4194304 + uint64_t callback_trigger_interval_size; + + // For BackupEngineReadOnly, Open() will open at most this many of the + // latest non-corrupted backups. + // + // Note: this setting is ignored (behaves like INT_MAX) for any kind of + // writable BackupEngine because it would inhibit accounting for shared + // files for proper backup deletion, including purging any incompletely + // created backups on creation of a new backup. + // + // Default: INT_MAX + int max_valid_backups_to_open; + + // ShareFilesNaming describes possible naming schemes for backup + // table and blob file names when they are stored in the + // shared_checksum directory (i.e., both share_table_files and + // share_files_with_checksum are true). + enum ShareFilesNaming : uint32_t { + // Backup blob filenames are __.blob and + // backup SST filenames are __.sst + // where is an unsigned decimal integer. This is the + // original/legacy naming scheme for share_files_with_checksum, + // with two problems: + // * At massive scale, collisions on this triple with different file + // contents is plausible. + // * Determining the name to use requires computing the checksum, + // so generally requires reading the whole file even if the file + // is already backed up. + // + // ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR ** + kLegacyCrc32cAndFileSize = 1U, + + // Backup SST filenames are _s.sst. This + // pair of values should be very strongly unique for a given SST file + // and easily determined before computing a checksum. The 's' indicates + // the value is a DB session id, not a checksum. + // + // Exceptions: + // * For blob files, kLegacyCrc32cAndFileSize is used as currently + // db_session_id is not supported by the blob file format. + // * For old SST files without a DB session id, kLegacyCrc32cAndFileSize + // will be used instead, matching the names assigned by RocksDB versions + // not supporting the newer naming scheme. + // * See also flags below. + kUseDbSessionId = 2U, + + kMaskNoNamingFlags = 0xffffU, + + // If not already part of the naming scheme, insert + // _ + // before .sst and .blob in the name. In case of user code actually parsing + // the last _ before the .sst and .blob as the file size, this + // preserves that feature of kLegacyCrc32cAndFileSize. In other words, this + // option makes official that unofficial feature of the backup metadata. + // + // We do not consider SST and blob file sizes to have sufficient entropy to + // contribute significantly to naming uniqueness. + kFlagIncludeFileSize = 1U << 31, + + kMaskNamingFlags = ~kMaskNoNamingFlags, + }; + + // Naming option for share_files_with_checksum table and blob files. See + // ShareFilesNaming for details. 
+ // + // Modifying this option cannot introduce a downgrade compatibility issue + // because RocksDB can read, restore, and delete backups using different file + // names, and it's OK for a backup directory to use a mixture of table and + // blob files naming schemes. + // + // However, modifying this option and saving more backups to the same + // directory can lead to the same file getting saved again to that + // directory, under the new shared name in addition to the old shared + // name. + // + // Default: kUseDbSessionId | kFlagIncludeFileSize + // + // Note: This option comes into effect only if both share_files_with_checksum + // and share_table_files are true. + ShareFilesNaming share_files_with_checksum_naming; + + void Dump(Logger* logger) const; + + explicit BackupEngineOptions( + const std::string& _backup_dir, Env* _backup_env = nullptr, + bool _share_table_files = true, Logger* _info_log = nullptr, + bool _sync = true, bool _destroy_old_data = false, + bool _backup_log_files = true, uint64_t _backup_rate_limit = 0, + uint64_t _restore_rate_limit = 0, int _max_background_operations = 1, + uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024, + int _max_valid_backups_to_open = INT_MAX, + ShareFilesNaming _share_files_with_checksum_naming = + static_cast(kUseDbSessionId | kFlagIncludeFileSize)) + : backup_dir(_backup_dir), + backup_env(_backup_env), + share_table_files(_share_table_files), + info_log(_info_log), + sync(_sync), + destroy_old_data(_destroy_old_data), + backup_log_files(_backup_log_files), + backup_rate_limit(_backup_rate_limit), + restore_rate_limit(_restore_rate_limit), + share_files_with_checksum(true), + max_background_operations(_max_background_operations), + callback_trigger_interval_size(_callback_trigger_interval_size), + max_valid_backups_to_open(_max_valid_backups_to_open), + share_files_with_checksum_naming(_share_files_with_checksum_naming) { + assert(share_table_files || !share_files_with_checksum); + assert((share_files_with_checksum_naming & kMaskNoNamingFlags) != 0); + } +}; + +inline BackupEngineOptions::ShareFilesNaming operator&( + BackupEngineOptions::ShareFilesNaming lhs, + BackupEngineOptions::ShareFilesNaming rhs) { + uint32_t l = static_cast(lhs); + uint32_t r = static_cast(rhs); + assert(r == BackupEngineOptions::kMaskNoNamingFlags || + (r & BackupEngineOptions::kMaskNoNamingFlags) == 0); + return static_cast(l & r); +} + +inline BackupEngineOptions::ShareFilesNaming operator|( + BackupEngineOptions::ShareFilesNaming lhs, + BackupEngineOptions::ShareFilesNaming rhs) { + uint32_t l = static_cast(lhs); + uint32_t r = static_cast(rhs); + assert((r & BackupEngineOptions::kMaskNoNamingFlags) == 0); + return static_cast(l | r); +} + +struct CreateBackupOptions { + // Flush will always trigger if 2PC is enabled. + // If write-ahead logs are disabled, set flush_before_backup=true to + // avoid losing unflushed key/value pairs from the memtable. + bool flush_before_backup = false; + + // Callback for reporting progress, based on callback_trigger_interval_size. + std::function progress_callback = []() {}; + + // If false, background_thread_cpu_priority is ignored. + // Otherwise, the cpu priority can be decreased, + // if you try to increase the priority, the priority will not change. + // The initial priority of the threads is CpuPriority::kNormal, + // so you can decrease to priorities lower than kNormal. 
+ bool decrease_background_thread_cpu_priority = false; + CpuPriority background_thread_cpu_priority = CpuPriority::kNormal; +}; + +struct RestoreOptions { + // If true, restore won't overwrite the existing log files in wal_dir. It will + // also move all log files from archive directory to wal_dir. Use this option + // in combination with BackupEngineOptions::backup_log_files = false for + // persisting in-memory databases. + // Default: false + bool keep_log_files; + + explicit RestoreOptions(bool _keep_log_files = false) + : keep_log_files(_keep_log_files) {} +}; + +struct BackupFileInfo { + // File name and path relative to the backup_dir directory. + std::string relative_filename; + + // Size of the file in bytes, not including filesystem overheads. + uint64_t size; +}; + +typedef uint32_t BackupID; + +struct BackupInfo { + BackupID backup_id = 0U; + // Creation time, according to GetCurrentTime + int64_t timestamp = 0; + + // Total size in bytes (based on file payloads, not including filesystem + // overheads or backup meta file) + uint64_t size = 0U; + + // Number of backed up files, some of which might be shared with other + // backups. Does not include backup meta file. + uint32_t number_files = 0U; + + // Backup API user metadata + std::string app_metadata; + + // Backup file details, if requested with include_file_details=true + std::vector file_details; + + // DB "name" (a directory in the backup_env) for opening this backup as a + // read-only DB. This should also be used as the DBOptions::wal_dir, such + // as by default setting wal_dir="". See also env_for_open. + // This field is only set if include_file_details=true + std::string name_for_open; + + // An Env(+FileSystem) for opening this backup as a read-only DB, with + // DB::OpenForReadOnly or similar. This field is only set if + // include_file_details=true. (The FileSystem in this Env takes care + // of making shared backup files openable from the `name_for_open` DB + // directory.) See also name_for_open. + // + // This Env might or might not be shared with other backups. To work + // around DBOptions::env being a raw pointer, this is a shared_ptr so + // that keeping either this BackupInfo, the BackupEngine, or a copy of + // this shared_ptr alive is sufficient to keep the Env alive for use by + // a read-only DB. + std::shared_ptr env_for_open; + + BackupInfo() {} + + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, + uint32_t _number_files, const std::string& _app_metadata) + : backup_id(_backup_id), + timestamp(_timestamp), + size(_size), + number_files(_number_files), + app_metadata(_app_metadata) {} +}; + +class BackupStatistics { + public: + BackupStatistics() { + number_success_backup = 0; + number_fail_backup = 0; + } + + BackupStatistics(uint32_t _number_success_backup, + uint32_t _number_fail_backup) + : number_success_backup(_number_success_backup), + number_fail_backup(_number_fail_backup) {} + + ~BackupStatistics() {} + + void IncrementNumberSuccessBackup(); + void IncrementNumberFailBackup(); + + uint32_t GetNumberSuccessBackup() const; + uint32_t GetNumberFailBackup() const; + + std::string ToString() const; + + private: + uint32_t number_success_backup; + uint32_t number_fail_backup; +}; + +// Read-only functions of a BackupEngine. (Restore writes to another directory +// not the backup directory.) See BackupEngine comments for details on +// safe concurrent operations. 
+class BackupEngineReadOnlyBase { + public: + virtual ~BackupEngineReadOnlyBase() {} + + // Returns info about the latest good backup in backup_info, or NotFound + // no good backup exists. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual Status GetLatestBackupInfo( + BackupInfo* backup_info, bool include_file_details = false) const = 0; + + // Returns info about a specific backup in backup_info, or NotFound + // or Corruption status if the requested backup id does not exist or is + // known corrupt. + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual Status GetBackupInfo(BackupID backup_id, BackupInfo* backup_info, + bool include_file_details = false) const = 0; + + // Returns info about backups in backup_info + // Setting include_file_details=true provides information about each + // backed-up file in BackupInfo::file_details and more. + virtual void GetBackupInfo(std::vector* backup_info, + bool include_file_details = false) const = 0; + + // Returns info about corrupt backups in corrupt_backups. + // WARNING: Any write to the BackupEngine could trigger automatic + // GarbageCollect(), which could delete files that would be needed to + // manually recover a corrupt backup or to preserve an unrecognized (e.g. + // incompatible future version) backup. + virtual void GetCorruptedBackups( + std::vector* corrupt_backup_ids) const = 0; + + // Restore to specified db_dir and wal_dir from backup_id. + virtual Status RestoreDBFromBackup(const RestoreOptions& options, + BackupID backup_id, + const std::string& db_dir, + const std::string& wal_dir) const = 0; + + // keep for backward compatibility. + virtual Status RestoreDBFromBackup( + BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& options = RestoreOptions()) const { + return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir); + } + + // Like RestoreDBFromBackup but restores from latest non-corrupt backup_id + virtual Status RestoreDBFromLatestBackup( + const RestoreOptions& options, const std::string& db_dir, + const std::string& wal_dir) const = 0; + + // keep for backward compatibility. + virtual Status RestoreDBFromLatestBackup( + const std::string& db_dir, const std::string& wal_dir, + const RestoreOptions& options = RestoreOptions()) const { + return RestoreDBFromLatestBackup(options, db_dir, wal_dir); + } + + // If verify_with_checksum is true, this function + // inspects the current checksums and file sizes of backup files to see if + // they match our expectation. + // + // If verify_with_checksum is false, this function + // checks that each file exists and that the size of the file matches our + // expectation. It does not check file checksum. + // + // If this BackupEngine created the backup, it compares the files' current + // sizes (and current checksum) against the number of bytes written to + // them (and the checksum calculated) during creation. + // Otherwise, it compares the files' current sizes (and checksums) against + // their sizes (and checksums) when the BackupEngine was opened. + // + // Returns Status::OK() if all checks are good + virtual Status VerifyBackup(BackupID backup_id, + bool verify_with_checksum = false) const = 0; +}; + +// Append-only functions of a BackupEngine. 
See BackupEngine comment for +// details on distinction between Append and Write operations and safe +// concurrent operations. +class BackupEngineAppendOnlyBase { + public: + virtual ~BackupEngineAppendOnlyBase() {} + + // same as CreateNewBackup, but stores extra application metadata. + virtual Status CreateNewBackupWithMetadata( + const CreateBackupOptions& options, DB* db, + const std::string& app_metadata, BackupID* new_backup_id = nullptr) = 0; + + // keep here for backward compatibility. + virtual Status CreateNewBackupWithMetadata( + DB* db, const std::string& app_metadata, bool flush_before_backup = false, + std::function progress_callback = []() {}) { + CreateBackupOptions options; + options.flush_before_backup = flush_before_backup; + options.progress_callback = progress_callback; + return CreateNewBackupWithMetadata(options, db, app_metadata); + } + + // Captures the state of the database by creating a new (latest) backup. + // On success (OK status), the BackupID of the new backup is saved to + // *new_backup_id when not nullptr. + virtual Status CreateNewBackup(const CreateBackupOptions& options, DB* db, + BackupID* new_backup_id = nullptr) { + return CreateNewBackupWithMetadata(options, db, "", new_backup_id); + } + + // keep here for backward compatibility. + virtual Status CreateNewBackup( + DB* db, bool flush_before_backup = false, + std::function progress_callback = []() {}) { + CreateBackupOptions options; + options.flush_before_backup = flush_before_backup; + options.progress_callback = progress_callback; + return CreateNewBackup(options, db); + } + + // Call this from another thread if you want to stop the backup + // that is currently happening. It will return immediately, will + // not wait for the backup to stop. + // The backup will stop ASAP and the call to CreateNewBackup will + // return Status::Incomplete(). It will not clean up after itself, but + // the state will remain consistent. The state will be cleaned up the + // next time you call CreateNewBackup or GarbageCollect. + virtual void StopBackup() = 0; + + // Will delete any files left over from incomplete creation or deletion of + // a backup. This is not normally needed as those operations also clean up + // after prior incomplete calls to the same kind of operation (create or + // delete). This does not delete corrupt backups but can delete files that + // would be needed to manually recover a corrupt backup or to preserve an + // unrecognized (e.g. incompatible future version) backup. + // NOTE: This is not designed to delete arbitrary files added to the backup + // directory outside of BackupEngine, and clean-up is always subject to + // permissions on and availability of the underlying filesystem. + // NOTE2: For concurrency and interference purposes (see BackupEngine + // comment), GarbageCollect (GC) is like other Append operations, even + // though it seems different. Although GC can delete physical data, it does + // not delete any logical data read by Read operations. GC can interfere + // with Append or Write operations in another BackupEngine on the same + // backup_dir, because temporary files will be treated as obsolete and + // deleted. + virtual Status GarbageCollect() = 0; +}; + +// A backup engine for organizing and managing backups. +// This class is not user-extensible. +// +// This class declaration adds "Write" operations in addition to the +// operations from BackupEngineAppendOnlyBase and BackupEngineReadOnlyBase. 
+// +// # Concurrency between threads on the same BackupEngine* object +// +// As of version 6.20, BackupEngine* operations are generally thread-safe, +// using a read-write lock, though single-thread operation is still +// recommended to avoid TOCTOU bugs. Specifically, particular kinds of +// concurrent operations behave like this: +// +// op1\op2| Read | Append | Write +// -------|-------|--------|-------- +// Read | conc | block | block +// Append | block | block | block +// Write | block | block | block +// +// conc = operations safely proceed concurrently +// block = one of the operations safely blocks until the other completes. +// There is generally no guarantee as to which completes first. +// +// StopBackup is the only operation that affects an ongoing operation. +// +// # Interleaving operations between BackupEngine* objects open on the +// same backup_dir +// +// It is recommended only to have one BackupEngine* object open for a given +// backup_dir, but it is possible to mix / interleave some operations +// (regardless of whether they are concurrent) with these caveats: +// +// op1\op2| Open | Read | Append | Write +// -------|--------|--------|--------|-------- +// Open | conc | conc | atomic | unspec +// Read | conc | conc | old | unspec +// Append | atomic | old | unspec | unspec +// Write | unspec | unspec | unspec | unspec +// +// Special case: Open with destroy_old_data=true is really a Write +// +// conc = operations safely proceed, concurrently when applicable +// atomic = operations are effectively atomic; if a concurrent Append +// operation has not completed at some key point during Open, the +// opened BackupEngine* will never see the result of the Append op. +// old = Read operations do not include any state changes from other +// BackupEngine* objects; they return the state at their Open time. +// unspec = Behavior is unspecified, including possibly trashing the +// backup_dir, but is "memory safe" (no C++ undefined behavior) +// +class BackupEngine : public BackupEngineReadOnlyBase, + public BackupEngineAppendOnlyBase { + public: + virtual ~BackupEngine() {} + + // BackupEngineOptions have to be the same as the ones used in previous + // BackupEngines for the same backup directory. + static Status Open(const BackupEngineOptions& options, Env* db_env, + BackupEngine** backup_engine_ptr); + + // keep for backward compatibility. + static Status Open(Env* db_env, const BackupEngineOptions& options, + BackupEngine** backup_engine_ptr) { + return BackupEngine::Open(options, db_env, backup_engine_ptr); + } + + // Deletes old backups, keeping latest num_backups_to_keep alive. + // See also DeleteBackup. + virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; + + // Deletes a specific backup. If this operation (or PurgeOldBackups) + // is not completed due to crash, power failure, etc. the state + // will be cleaned up the next time you call DeleteBackup, + // PurgeOldBackups, or GarbageCollect. + virtual Status DeleteBackup(BackupID backup_id) = 0; +}; + +// A variant of BackupEngine that only allows "Read" operations. See +// BackupEngine comment for details. This class is not user-extensible. +class BackupEngineReadOnly : public BackupEngineReadOnlyBase { + public: + virtual ~BackupEngineReadOnly() {} + + static Status Open(const BackupEngineOptions& options, Env* db_env, + BackupEngineReadOnly** backup_engine_ptr); + // keep for backward compatibility. 
+ static Status Open(Env* db_env, const BackupEngineOptions& options, + BackupEngineReadOnly** backup_engine_ptr) { + return BackupEngineReadOnly::Open(options, db_env, backup_engine_ptr); + } +}; + +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 36f29edbb2f..de040b5521c 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -1,535 +1,26 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// This is a DEPRECATED header for API backward compatibility. Please +// use backup_engine.h. #pragma once #ifndef ROCKSDB_LITE +// A legacy unnecessary include #include -#include -#include -#include -#include -#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/backup_engine.h" -#include "rocksdb/env.h" -#include "rocksdb/options.h" -#include "rocksdb/status.h" +// A legacy unnecessary include +#include "rocksdb/utilities/stackable_db.h" namespace ROCKSDB_NAMESPACE { -// The default DB file checksum function name. -constexpr char kDbFileChecksumFuncName[] = "FileChecksumCrc32c"; -// The default BackupEngine file checksum function name. -constexpr char kBackupFileChecksumFuncName[] = "crc32c"; - -struct BackupableDBOptions { - // Where to keep the backup files. Has to be different than dbname_ - // Best to set this to dbname_ + "/backups" - // Required - std::string backup_dir; - - // Backup Env object. It will be used for backup file I/O. If it's - // nullptr, backups will be written out using DBs Env. If it's - // non-nullptr, backup's I/O will be performed using this object. - // If you want to have backups on HDFS, use HDFS Env here! - // Default: nullptr - Env* backup_env; - - // If share_table_files == true, backup will assume that table files with - // same name have the same contents. This enables incremental backups and - // avoids unnecessary data copies. - // If share_table_files == false, each backup will be on its own and will - // not share any data with other backups. - // default: true - bool share_table_files; - - // Backup info and error messages will be written to info_log - // if non-nullptr. - // Default: nullptr - Logger* info_log; - - // If sync == true, we can guarantee you'll get consistent backup even - // on a machine crash/reboot. Backup process is slower with sync enabled. - // If sync == false, we don't guarantee anything on machine reboot. However, - // chances are some of the backups are consistent. - // Default: true - bool sync; - - // If true, it will delete whatever backups there are already - // Default: false - bool destroy_old_data; - - // If false, we won't backup log files. This option can be useful for backing - // up in-memory databases where log file are persisted, but table files are in - // memory. - // Default: true - bool backup_log_files; - - // Max bytes that can be transferred in a second during backup. 
- // If 0, go as fast as you can - // Default: 0 - uint64_t backup_rate_limit; - - // Backup rate limiter. Used to control transfer speed for backup. If this is - // not null, backup_rate_limit is ignored. - // Default: nullptr - std::shared_ptr backup_rate_limiter{nullptr}; - - // Max bytes that can be transferred in a second during restore. - // If 0, go as fast as you can - // Default: 0 - uint64_t restore_rate_limit; - - // Restore rate limiter. Used to control transfer speed during restore. If - // this is not null, restore_rate_limit is ignored. - // Default: nullptr - std::shared_ptr restore_rate_limiter{nullptr}; - - // Only used if share_table_files is set to true. If true, will consider - // that backups can come from different databases, even differently mutated - // databases with the same DB ID. See share_files_with_checksum_naming and - // ShareFilesNaming for details on how table files names are made - // unique between databases. - // - // Using 'true' is fundamentally safer, and performance improvements vs. - // original design should leave almost no reason to use the 'false' setting. - // - // Default (only for historical reasons): false - bool share_files_with_checksum; - - // Up to this many background threads will copy files for CreateNewBackup() - // and RestoreDBFromBackup() - // Default: 1 - int max_background_operations; - - // During backup user can get callback every time next - // callback_trigger_interval_size bytes being copied. - // Default: 4194304 - uint64_t callback_trigger_interval_size; - - // For BackupEngineReadOnly, Open() will open at most this many of the - // latest non-corrupted backups. - // - // Note: this setting is ignored (behaves like INT_MAX) for any kind of - // writable BackupEngine because it would inhibit accounting for shared - // files for proper backup deletion, including purging any incompletely - // created backups on creation of a new backup. - // - // Default: INT_MAX - int max_valid_backups_to_open; - - // ShareFilesNaming describes possible naming schemes for backup - // table file names when the table files are stored in the shared_checksum - // directory (i.e., both share_table_files and share_files_with_checksum - // are true). - enum ShareFilesNaming : uint32_t { - // Backup SST filenames are __.sst - // where is an unsigned decimal integer. This is the - // original/legacy naming scheme for share_files_with_checksum, - // with two problems: - // * At massive scale, collisions on this triple with different file - // contents is plausible. - // * Determining the name to use requires computing the checksum, - // so generally requires reading the whole file even if the file - // is already backed up. - // ** ONLY RECOMMENDED FOR PRESERVING OLD BEHAVIOR ** - kLegacyCrc32cAndFileSize = 1U, - - // Backup SST filenames are _s.sst. This - // pair of values should be very strongly unique for a given SST file - // and easily determined before computing a checksum. The 's' indicates - // the value is a DB session id, not a checksum. - // - // Exceptions: - // * For old SST files without a DB session id, kLegacyCrc32cAndFileSize - // will be used instead, matching the names assigned by RocksDB versions - // not supporting the newer naming scheme. - // * See also flags below. - kUseDbSessionId = 2U, - - kMaskNoNamingFlags = 0xffffU, - - // If not already part of the naming scheme, insert - // _ - // before .sst in the name. 
In case of user code actually parsing the - // last _ before the .sst as the file size, this preserves that - // feature of kLegacyCrc32cAndFileSize. In other words, this option makes - // official that unofficial feature of the backup metadata. - // - // We do not consider SST file sizes to have sufficient entropy to - // contribute significantly to naming uniqueness. - kFlagIncludeFileSize = 1U << 31, - - // When encountering an SST file from a Facebook-internal early - // release of 6.12, use the default naming scheme in effect for - // when the SST file was generated (assuming full file checksum - // was not set to GetFileChecksumGenCrc32cFactory()). That naming is - // _.sst - // and ignores kFlagIncludeFileSize setting. - // NOTE: This flag is intended to be temporary and should be removed - // in a later release. - kFlagMatchInterimNaming = 1U << 30, - - kMaskNamingFlags = ~kMaskNoNamingFlags, - }; - - // Naming option for share_files_with_checksum table files. See - // ShareFilesNaming for details. - // - // Modifying this option cannot introduce a downgrade compatibility issue - // because RocksDB can read, restore, and delete backups using different file - // names, and it's OK for a backup directory to use a mixture of table file - // naming schemes. - // - // However, modifying this option and saving more backups to the same - // directory can lead to the same file getting saved again to that - // directory, under the new shared name in addition to the old shared - // name. - // - // Default: kUseDbSessionId | kFlagIncludeFileSize | kFlagMatchInterimNaming - // - // Note: This option comes into effect only if both share_files_with_checksum - // and share_table_files are true. - ShareFilesNaming share_files_with_checksum_naming; - - void Dump(Logger* logger) const; - - explicit BackupableDBOptions( - const std::string& _backup_dir, Env* _backup_env = nullptr, - bool _share_table_files = true, Logger* _info_log = nullptr, - bool _sync = true, bool _destroy_old_data = false, - bool _backup_log_files = true, uint64_t _backup_rate_limit = 0, - uint64_t _restore_rate_limit = 0, int _max_background_operations = 1, - uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024, - int _max_valid_backups_to_open = INT_MAX, - ShareFilesNaming _share_files_with_checksum_naming = - static_cast(kUseDbSessionId | kFlagIncludeFileSize | - kFlagMatchInterimNaming)) - : backup_dir(_backup_dir), - backup_env(_backup_env), - share_table_files(_share_table_files), - info_log(_info_log), - sync(_sync), - destroy_old_data(_destroy_old_data), - backup_log_files(_backup_log_files), - backup_rate_limit(_backup_rate_limit), - restore_rate_limit(_restore_rate_limit), - share_files_with_checksum(false), - max_background_operations(_max_background_operations), - callback_trigger_interval_size(_callback_trigger_interval_size), - max_valid_backups_to_open(_max_valid_backups_to_open), - share_files_with_checksum_naming(_share_files_with_checksum_naming) { - assert(share_table_files || !share_files_with_checksum); - assert((share_files_with_checksum_naming & kMaskNoNamingFlags) != 0); - } -}; - -inline BackupableDBOptions::ShareFilesNaming operator&( - BackupableDBOptions::ShareFilesNaming lhs, - BackupableDBOptions::ShareFilesNaming rhs) { - uint32_t l = static_cast(lhs); - uint32_t r = static_cast(rhs); - assert(r == BackupableDBOptions::kMaskNoNamingFlags || - (r & BackupableDBOptions::kMaskNoNamingFlags) == 0); - return static_cast(l & r); -} - -inline BackupableDBOptions::ShareFilesNaming operator|( - 
BackupableDBOptions::ShareFilesNaming lhs, - BackupableDBOptions::ShareFilesNaming rhs) { - uint32_t l = static_cast(lhs); - uint32_t r = static_cast(rhs); - assert((r & BackupableDBOptions::kMaskNoNamingFlags) == 0); - return static_cast(l | r); -} - -struct CreateBackupOptions { - // Flush will always trigger if 2PC is enabled. - // If write-ahead logs are disabled, set flush_before_backup=true to - // avoid losing unflushed key/value pairs from the memtable. - bool flush_before_backup = false; - - // Callback for reporting progress, based on callback_trigger_interval_size. - std::function progress_callback = []() {}; - - // If false, background_thread_cpu_priority is ignored. - // Otherwise, the cpu priority can be decreased, - // if you try to increase the priority, the priority will not change. - // The initial priority of the threads is CpuPriority::kNormal, - // so you can decrease to priorities lower than kNormal. - bool decrease_background_thread_cpu_priority = false; - CpuPriority background_thread_cpu_priority = CpuPriority::kNormal; -}; - -struct RestoreOptions { - // If true, restore won't overwrite the existing log files in wal_dir. It will - // also move all log files from archive directory to wal_dir. Use this option - // in combination with BackupableDBOptions::backup_log_files = false for - // persisting in-memory databases. - // Default: false - bool keep_log_files; - - explicit RestoreOptions(bool _keep_log_files = false) - : keep_log_files(_keep_log_files) {} -}; - -typedef uint32_t BackupID; - -struct BackupInfo { - BackupID backup_id; - int64_t timestamp; - uint64_t size; - - uint32_t number_files; - std::string app_metadata; - - BackupInfo() {} - - BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, - uint32_t _number_files, const std::string& _app_metadata) - : backup_id(_backup_id), - timestamp(_timestamp), - size(_size), - number_files(_number_files), - app_metadata(_app_metadata) {} -}; - -class BackupStatistics { - public: - BackupStatistics() { - number_success_backup = 0; - number_fail_backup = 0; - } - - BackupStatistics(uint32_t _number_success_backup, - uint32_t _number_fail_backup) - : number_success_backup(_number_success_backup), - number_fail_backup(_number_fail_backup) {} - - ~BackupStatistics() {} - - void IncrementNumberSuccessBackup(); - void IncrementNumberFailBackup(); - - uint32_t GetNumberSuccessBackup() const; - uint32_t GetNumberFailBackup() const; - - std::string ToString() const; - - private: - uint32_t number_success_backup; - uint32_t number_fail_backup; -}; - -// A backup engine for accessing information about backups and restoring from -// them. -// BackupEngineReadOnly is not extensible. -class BackupEngineReadOnly { - public: - virtual ~BackupEngineReadOnly() {} - - static Status Open(const BackupableDBOptions& options, Env* db_env, - BackupEngineReadOnly** backup_engine_ptr); - // keep for backward compatibility. 
- static Status Open(Env* db_env, const BackupableDBOptions& options, - BackupEngineReadOnly** backup_engine_ptr) { - return BackupEngineReadOnly::Open(options, db_env, backup_engine_ptr); - } - - // Returns info about backups in backup_info - // You can GetBackupInfo safely, even with other BackupEngine performing - // backups on the same directory - virtual void GetBackupInfo(std::vector* backup_info) = 0; - - // Returns info about corrupt backups in corrupt_backups - virtual void GetCorruptedBackups( - std::vector* corrupt_backup_ids) = 0; - - // Restoring DB from backup is NOT safe when there is another BackupEngine - // running that might call DeleteBackup() or PurgeOldBackups(). It is caller's - // responsibility to synchronize the operation, i.e. don't delete the backup - // when you're restoring from it - // See also the corresponding doc in BackupEngine - virtual Status RestoreDBFromBackup(const RestoreOptions& options, - BackupID backup_id, - const std::string& db_dir, - const std::string& wal_dir) = 0; - - // keep for backward compatibility. - virtual Status RestoreDBFromBackup( - BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& options = RestoreOptions()) { - return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir); - } - - // See the corresponding doc in BackupEngine - virtual Status RestoreDBFromLatestBackup(const RestoreOptions& options, - const std::string& db_dir, - const std::string& wal_dir) = 0; - - // keep for backward compatibility. - virtual Status RestoreDBFromLatestBackup( - const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& options = RestoreOptions()) { - return RestoreDBFromLatestBackup(options, db_dir, wal_dir); - } - - // If verify_with_checksum is true, this function - // inspects the current checksums and file sizes of backup files to see if - // they match our expectation. - // - // If verify_with_checksum is false, this function - // checks that each file exists and that the size of the file matches our - // expectation. It does not check file checksum. - // - // If this BackupEngine created the backup, it compares the files' current - // sizes (and current checksum) against the number of bytes written to - // them (and the checksum calculated) during creation. - // Otherwise, it compares the files' current sizes (and checksums) against - // their sizes (and checksums) when the BackupEngine was opened. - // - // Returns Status::OK() if all checks are good - virtual Status VerifyBackup(BackupID backup_id, - bool verify_with_checksum = false) = 0; -}; - -// A backup engine for creating new backups. -// BackupEngine is not extensible. -class BackupEngine { - public: - virtual ~BackupEngine() {} - - // BackupableDBOptions have to be the same as the ones used in previous - // BackupEngines for the same backup directory. - static Status Open(const BackupableDBOptions& options, Env* db_env, - BackupEngine** backup_engine_ptr); - - // keep for backward compatibility. - static Status Open(Env* db_env, const BackupableDBOptions& options, - BackupEngine** backup_engine_ptr) { - return BackupEngine::Open(options, db_env, backup_engine_ptr); - } - - // same as CreateNewBackup, but stores extra application metadata. - virtual Status CreateNewBackupWithMetadata( - const CreateBackupOptions& options, DB* db, - const std::string& app_metadata) = 0; - - // keep here for backward compatibility. 
- virtual Status CreateNewBackupWithMetadata( - DB* db, const std::string& app_metadata, bool flush_before_backup = false, - std::function progress_callback = []() {}) { - CreateBackupOptions options; - options.flush_before_backup = flush_before_backup; - options.progress_callback = progress_callback; - return CreateNewBackupWithMetadata(options, db, app_metadata); - } - - // Captures the state of the database in the latest backup - // NOT a thread safe call - virtual Status CreateNewBackup(const CreateBackupOptions& options, DB* db) { - return CreateNewBackupWithMetadata(options, db, ""); - } - - // keep here for backward compatibility. - virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false, - std::function progress_callback = - []() {}) { - CreateBackupOptions options; - options.flush_before_backup = flush_before_backup; - options.progress_callback = progress_callback; - return CreateNewBackup(options, db); - } - - // Deletes old backups, keeping latest num_backups_to_keep alive. - // See also DeleteBackup. - virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; - - // Deletes a specific backup. If this operation (or PurgeOldBackups) - // is not completed due to crash, power failure, etc. the state - // will be cleaned up the next time you call DeleteBackup, - // PurgeOldBackups, or GarbageCollect. - virtual Status DeleteBackup(BackupID backup_id) = 0; - - // Call this from another thread if you want to stop the backup - // that is currently happening. It will return immediatelly, will - // not wait for the backup to stop. - // The backup will stop ASAP and the call to CreateNewBackup will - // return Status::Incomplete(). It will not clean up after itself, but - // the state will remain consistent. The state will be cleaned up the - // next time you call CreateNewBackup or GarbageCollect. - virtual void StopBackup() = 0; - - // Returns info about backups in backup_info - virtual void GetBackupInfo(std::vector* backup_info) = 0; - - // Returns info about corrupt backups in corrupt_backups - virtual void GetCorruptedBackups( - std::vector* corrupt_backup_ids) = 0; - - // restore from backup with backup_id - // IMPORTANT -- if options_.share_table_files == true, - // options_.share_files_with_checksum == false, you restore DB from some - // backup that is not the latest, and you start creating new backups from the - // new DB, they will probably fail. - // - // Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. - // If you add new data to the DB and try creating a new backup now, the - // database will diverge from backups 4 and 5 and the new backup will fail. - // If you want to create new backup, you will first have to delete backups 4 - // and 5. - virtual Status RestoreDBFromBackup(const RestoreOptions& options, - BackupID backup_id, - const std::string& db_dir, - const std::string& wal_dir) = 0; - - // keep for backward compatibility. - virtual Status RestoreDBFromBackup( - BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& options = RestoreOptions()) { - return RestoreDBFromBackup(options, backup_id, db_dir, wal_dir); - } - - // restore from the latest backup - virtual Status RestoreDBFromLatestBackup(const RestoreOptions& options, - const std::string& db_dir, - const std::string& wal_dir) = 0; - - // keep for backward compatibility. 
- virtual Status RestoreDBFromLatestBackup( - const std::string& db_dir, const std::string& wal_dir, - const RestoreOptions& options = RestoreOptions()) { - return RestoreDBFromLatestBackup(options, db_dir, wal_dir); - } - - // If verify_with_checksum is true, this function - // inspects the current checksums and file sizes of backup files to see if - // they match our expectation. - // - // If verify_with_checksum is false, this function - // checks that each file exists and that the size of the file matches our - // expectation. It does not check file checksum. - // - // Returns Status::OK() if all checks are good - virtual Status VerifyBackup(BackupID backup_id, - bool verify_with_checksum = false) = 0; - - // Will delete any files left over from incomplete creation or deletion of - // a backup. This is not normally needed as those operations also clean up - // after prior incomplete calls to the same kind of operation (create or - // delete). - // NOTE: This is not designed to delete arbitrary files added to the backup - // directory outside of BackupEngine, and clean-up is always subject to - // permissions on and availability of the underlying filesystem. - virtual Status GarbageCollect() = 0; -}; +using BackupableDBOptions = BackupEngineOptions; } // namespace ROCKSDB_NAMESPACE + #endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/checkpoint.h b/include/rocksdb/utilities/checkpoint.h index 1b6a7407ff1..df2a744033e 100644 --- a/include/rocksdb/utilities/checkpoint.h +++ b/include/rocksdb/utilities/checkpoint.h @@ -24,13 +24,13 @@ class Checkpoint { // Creates a Checkpoint object to be used for creating openable snapshots static Status Create(DB* db, Checkpoint** checkpoint_ptr); - // Builds an openable snapshot of RocksDB on the same disk, which - // accepts an output directory on the same disk, and under the directory - // (1) hard-linked SST files pointing to existing live SST files - // SST files will be copied if output directory is on a different filesystem - // (2) a copied manifest files and other files - // The directory should not already exist and will be created by this API. - // The directory will be an absolute path + // Builds an openable snapshot of RocksDB. checkpoint_dir should contain an + // absolute path. The specified directory should not exist, since it will be + // created by the API. + // When a checkpoint is created, + // (1) SST and blob files are hard linked if the output directory is on the + // same filesystem as the database, and copied otherwise. + // (2) other required files (like MANIFEST) are always copied. // log_size_for_flush: if the total log file size is equal or larger than // this value, then a flush is triggered for all the column families. The // default value is 0, which means flush is always triggered. 
If you move diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h index c7f227fc02a..e900abefee5 100644 --- a/include/rocksdb/utilities/ldb_cmd.h +++ b/include/rocksdb/utilities/ldb_cmd.h @@ -32,6 +32,7 @@ class LDBCommand { public: // Command-line arguments static const std::string ARG_ENV_URI; + static const std::string ARG_FS_URI; static const std::string ARG_DB; static const std::string ARG_PATH; static const std::string ARG_SECONDARY_PATH; @@ -88,6 +89,8 @@ class LDBCommand { virtual void OverrideBaseOptions(); + virtual void OverrideBaseCFOptions(ColumnFamilyOptions* cf_opts); + virtual void SetDBOptions(Options options) { options_ = options; } virtual void SetColumnFamilies( @@ -135,6 +138,7 @@ class LDBCommand { protected: LDBCommandExecuteResult exec_state_; std::string env_uri_; + std::string fs_uri_; std::string db_path_; // If empty, open DB as primary. If non-empty, open the DB as secondary // with this secondary path. When running against a database opened by diff --git a/include/rocksdb/utilities/object_registry.h b/include/rocksdb/utilities/object_registry.h index 538cb6a8fd4..5a454d7755f 100644 --- a/include/rocksdb/utilities/object_registry.h +++ b/include/rocksdb/utilities/object_registry.h @@ -17,12 +17,23 @@ namespace ROCKSDB_NAMESPACE { class Logger; +class ObjectLibrary; + // Returns a new T when called with a string. Populates the std::unique_ptr // argument if granting ownership to caller. template using FactoryFunc = std::function*, std::string*)>; +// The signature of the function for loading factories +// into an object library. This method is expected to register +// factory functions in the supplied ObjectLibrary. +// The ObjectLibrary is the library in which the factories will be loaded. +// The std::string is the argument passed to the loader function. +// The RegistrarFunc should return the number of objects loaded into this +// library +using RegistrarFunc = std::function; + class ObjectLibrary { public: // Base class for an Entry in the Registry. @@ -62,9 +73,18 @@ class ObjectLibrary { FactoryFunc factory_; }; // End class FactoryEntry public: + explicit ObjectLibrary(const std::string& id) { id_ = id; } + + const std::string& GetID() const { return id_; } // Finds the entry matching the input name and type const Entry* FindEntry(const std::string& type, const std::string& name) const; + + // Returns the total number of factories registered for this library. + // This method returns the sum of all factories registered for all types. + // @param num_types returns how many unique types are registered. + size_t GetFactoryCount(size_t* num_types) const; + void Dump(Logger* logger) const; // Registers the factory with the library for the pattern. @@ -76,6 +96,12 @@ class ObjectLibrary { AddEntry(T::Type(), entry); return factory; } + + // Invokes the registrar function with the supplied arg for this library. 
+ int Register(const RegistrarFunc& registrar, const std::string& arg) { + return registrar(*this, arg); + } + // Returns the default ObjectLibrary static std::shared_ptr& Default(); @@ -85,6 +111,9 @@ class ObjectLibrary { // ** FactoryFunctions for this loader, organized by type std::unordered_map>> entries_; + + // The name for this library + std::string id_; }; // The ObjectRegistry is used to register objects that can be created by a @@ -93,11 +122,26 @@ class ObjectLibrary { class ObjectRegistry { public: static std::shared_ptr NewInstance(); - - ObjectRegistry(); + static std::shared_ptr NewInstance( + const std::shared_ptr& parent); + static std::shared_ptr Default(); + explicit ObjectRegistry(const std::shared_ptr& parent) + : parent_(parent) {} + + std::shared_ptr AddLibrary(const std::string& id) { + auto library = std::make_shared(id); + libraries_.push_back(library); + return library; + } void AddLibrary(const std::shared_ptr& library) { - libraries_.emplace_back(library); + libraries_.push_back(library); + } + + void AddLibrary(const std::string& id, const RegistrarFunc& registrar, + const std::string& arg) { + auto library = AddLibrary(id); + library->Register(registrar, arg); } // Creates a new T using the factory function that was registered with a @@ -193,6 +237,10 @@ class ObjectRegistry { void Dump(Logger* logger) const; private: + explicit ObjectRegistry(const std::shared_ptr& library) { + libraries_.push_back(library); + } + const ObjectLibrary::Entry* FindEntry(const std::string& type, const std::string& name) const; @@ -200,6 +248,7 @@ class ObjectRegistry { // The libraries are searched in reverse order (back to front) when // searching for entries. std::vector> libraries_; + std::shared_ptr parent_; }; } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/optimistic_transaction_db.h b/include/rocksdb/utilities/optimistic_transaction_db.h index 5356df71f39..c070e49a309 100644 --- a/include/rocksdb/utilities/optimistic_transaction_db.h +++ b/include/rocksdb/utilities/optimistic_transaction_db.h @@ -51,6 +51,8 @@ struct OptimisticTransactionDBOptions { uint32_t occ_lock_buckets = (1 << 20); }; +// Range deletions (including those in `WriteBatch`es passed to `Write()`) are +// incompatible with `OptimisticTransactionDB` and will return a non-OK `Status` class OptimisticTransactionDB : public StackableDB { public: // Open an OptimisticTransactionDB similar to DB::Open(). 
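A hedged sketch (not part of the patch) of the new registration flow in object_registry.h: a RegistrarFunc that loads factories into an ObjectLibrary, wired up through the AddLibrary(id) / Register(registrar, arg) / GetFactoryCount() additions shown above. MyCustomEnv is invented for illustration, and the template ObjectLibrary::Register<T>(pattern, factory) call inside the registrar is an assumption (only that method's body appears in the hunk above).

#include <memory>
#include <string>
#include "rocksdb/env.h"
#include "rocksdb/utilities/object_registry.h"

using namespace ROCKSDB_NAMESPACE;

class MyCustomEnv : public EnvWrapper {
 public:
  MyCustomEnv() : EnvWrapper(Env::Default()) {}
};

// A RegistrarFunc: registers factories in the supplied library and returns
// the number of factories it added.
static int RegisterMyObjects(ObjectLibrary& library,
                             const std::string& /*arg*/) {
  library.Register<Env>(
      "MyCustomEnv",
      [](const std::string& /*uri*/, std::unique_ptr<Env>* guard,
         std::string* /*errmsg*/) {
        guard->reset(new MyCustomEnv());
        return guard->get();
      });
  return 1;
}

void RegisterExample() {
  std::shared_ptr<ObjectRegistry> registry = ObjectRegistry::NewInstance();

  // Create a named library in the registry and invoke the registrar on it.
  auto library = registry->AddLibrary("my_plugins");
  int loaded = library->Register(RegisterMyObjects, /*arg=*/"");

  // GetFactoryCount() reports what was loaded: here 1 factory for 1 type.
  size_t num_types = 0;
  size_t num_factories = library->GetFactoryCount(&num_types);
  (void)loaded;
  (void)num_factories;
}

The three-argument AddLibrary(id, registrar, arg) overload above collapses the two steps into one call when the library handle itself is not needed afterwards.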
diff --git a/include/rocksdb/utilities/options_type.h b/include/rocksdb/utilities/options_type.h index 2bd081abfb0..7057c78ac20 100644 --- a/include/rocksdb/utilities/options_type.h +++ b/include/rocksdb/utilities/options_type.h @@ -25,6 +25,7 @@ enum class OptionType { kInt32T, kInt64T, kUInt, + kUInt8T, kUInt32T, kUInt64T, kSizeT, @@ -49,6 +50,8 @@ enum class OptionType { kStruct, kVector, kConfigurable, + kCustomizable, + kEncodedString, kUnknown, }; @@ -93,13 +96,14 @@ enum class OptionTypeFlags : uint32_t { kCompareLoose = ConfigOptions::kSanityLevelLooselyCompatible, kCompareExact = ConfigOptions::kSanityLevelExactMatch, - kMutable = 0x0100, // Option is mutable - kRawPointer = 0x0200, // The option is stored as a raw pointer - kShared = 0x0400, // The option is stored as a shared_ptr - kUnique = 0x0800, // The option is stored as a unique_ptr - kAllowNull = 0x1000, // The option can be null - kDontSerialize = 0x2000, // Don't serialize the option - kDontPrepare = 0x4000, // Don't prepare or sanitize this option + kMutable = 0x0100, // Option is mutable + kRawPointer = 0x0200, // The option is stored as a raw pointer + kShared = 0x0400, // The option is stored as a shared_ptr + kUnique = 0x0800, // The option is stored as a unique_ptr + kAllowNull = 0x1000, // The option can be null + kDontSerialize = 0x2000, // Don't serialize the option + kDontPrepare = 0x4000, // Don't prepare or sanitize this option + kStringNameOnly = 0x8000, // The option serializes to a name only }; inline OptionTypeFlags operator|(const OptionTypeFlags &a, @@ -174,7 +178,7 @@ bool VectorsAreEqual(const ConfigOptions& config_options, // @param addr Pointer to the object using ParseFunc = std::function; + const std::string& /*value*/, void* /*addr*/)>; // Function for converting an option "addr" into its string representation. // On success, Status::OK is returned and value is the serialized form. @@ -185,7 +189,7 @@ using ParseFunc = std::function; + const void* /*addr*/, std::string* /*value*/)>; // Function for comparing two option values // If they are not equal, updates "mismatch" with the name of the bad option @@ -197,7 +201,7 @@ using SerializeFunc = std::function; + const void* /*addr1*/, const void* /*addr2*/, std::string* mismatch)>; // A struct for storing constant option information such as option name, // option type, and offset. @@ -253,7 +257,7 @@ class OptionTypeInfo { // - Create a static map of string values to the corresponding enum value // - Call this method passing the static map in as a parameter. // Note that it is not necessary to add a new OptionType or make any - // other changes -- the returned object handles parsing, serialiation, and + // other changes -- the returned object handles parsing, serialization, and // comparisons. // // @param offset The offset in the option object for this enum @@ -270,10 +274,10 @@ class OptionTypeInfo { // @return OK if the value is found in the map // @return InvalidArgument if the value is not found in the map [map](const ConfigOptions&, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { if (map == nullptr) { return Status::NotSupported("No enum mapping ", name); - } else if (ParseEnum(*map, value, reinterpret_cast(addr))) { + } else if (ParseEnum(*map, value, static_cast(addr))) { return Status::OK(); } else { return Status::InvalidArgument("No mapping for enum ", name); @@ -284,11 +288,11 @@ class OptionTypeInfo { // value is updated to the corresponding string value in the map. 
// @return OK if the enum is found in the map // @return InvalidArgument if the enum is not found in the map - [map](const ConfigOptions&, const std::string& name, const char* addr, + [map](const ConfigOptions&, const std::string& name, const void* addr, std::string* value) { if (map == nullptr) { return Status::NotSupported("No enum mapping ", name); - } else if (SerializeEnum(*map, (*reinterpret_cast(addr)), + } else if (SerializeEnum(*map, (*static_cast(addr)), value)) { return Status::OK(); } else { @@ -297,10 +301,10 @@ class OptionTypeInfo { }, // Casts addr1 and addr2 to the enum type and returns true if // they are equal, false otherwise. - [](const ConfigOptions&, const std::string&, const char* addr1, - const char* addr2, std::string*) { - return (*reinterpret_cast(addr1) == - *reinterpret_cast(addr2)); + [](const ConfigOptions&, const std::string&, const void* addr1, + const void* addr2, std::string*) { + return (*static_cast(addr1) == + *static_cast(addr2)); }); } // End OptionTypeInfo::Enum @@ -335,20 +339,20 @@ class OptionTypeInfo { // Parses the struct and updates the fields at addr [struct_name, struct_map](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { return ParseStruct(opts, struct_name, struct_map, name, value, addr); }, // Serializes the struct options into value [struct_name, struct_map](const ConfigOptions& opts, - const std::string& name, const char* addr, + const std::string& name, const void* addr, std::string* value) { return SerializeStruct(opts, struct_name, struct_map, name, addr, value); }, // Compares the struct fields of addr1 and addr2 for equality [struct_name, struct_map](const ConfigOptions& opts, - const std::string& name, const char* addr1, - const char* addr2, std::string* mismatch) { + const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { return StructsAreEqual(opts, struct_name, struct_map, name, addr1, addr2, mismatch); }); @@ -361,14 +365,14 @@ class OptionTypeInfo { return OptionTypeInfo( offset, OptionType::kStruct, verification, flags, parse_func, [struct_name, struct_map](const ConfigOptions& opts, - const std::string& name, const char* addr, + const std::string& name, const void* addr, std::string* value) { return SerializeStruct(opts, struct_name, struct_map, name, addr, value); }, [struct_name, struct_map](const ConfigOptions& opts, - const std::string& name, const char* addr1, - const char* addr2, std::string* mismatch) { + const std::string& name, const void* addr1, + const void* addr2, std::string* mismatch) { return StructsAreEqual(opts, struct_name, struct_map, name, addr1, addr2, mismatch); }); @@ -384,30 +388,134 @@ class OptionTypeInfo { _offset, OptionType::kVector, _verification, _flags, [elem_info, separator](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { - auto result = reinterpret_cast*>(addr); + const std::string& value, void* addr) { + auto result = static_cast*>(addr); return ParseVector(opts, elem_info, separator, name, value, result); }, [elem_info, separator](const ConfigOptions& opts, - const std::string& name, const char* addr, + const std::string& name, const void* addr, std::string* value) { - const auto& vec = *(reinterpret_cast*>(addr)); + const auto& vec = *(static_cast*>(addr)); return SerializeVector(opts, elem_info, separator, name, vec, value); }, [elem_info](const ConfigOptions& opts, const std::string& name, - const char* addr1, const 
char* addr2, + const void* addr1, const void* addr2, std::string* mismatch) { - const auto& vec1 = *(reinterpret_cast*>(addr1)); - const auto& vec2 = *(reinterpret_cast*>(addr2)); + const auto& vec1 = *(static_cast*>(addr1)); + const auto& vec2 = *(static_cast*>(addr2)); return VectorsAreEqual(opts, elem_info, name, vec1, vec2, mismatch); }); } + // Create a new std::shared_ptr OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // std::shared_ptr object. + // + // @param offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomSharedPtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags) { + return AsCustomSharedPtr(offset, ovt, flags, nullptr, nullptr); + } + + template + static OptionTypeInfo AsCustomSharedPtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + return OptionTypeInfo( + offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kShared, + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* shared = static_cast*>(addr); + return T::CreateFromString(opts, value, shared); + }, + serialize_func, equals_func); + } + + // Create a new std::unique_ptr OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // std::unique_ptr object. + // + // @param offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomUniquePtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags) { + return AsCustomUniquePtr(offset, ovt, flags, nullptr, nullptr); + } + + template + static OptionTypeInfo AsCustomUniquePtr(int offset, + OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + return OptionTypeInfo( + offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kUnique, + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* unique = static_cast*>(addr); + return T::CreateFromString(opts, value, unique); + }, + serialize_func, equals_func); + } + + // Create a new Customizable* OptionTypeInfo + // This function will call the T::CreateFromString method to create a new + // T object. 
+ // + // @param _offset The offset for the Customizable from the base pointer + // @param ovt How to verify this option + // @param flags, Extra flags specifying the behavior of this option + // @param _sfunc Optional function for serializing this option + // @param _efunc Optional function for comparing this option + template + static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt, + OptionTypeFlags flags) { + return AsCustomRawPtr(offset, ovt, flags, nullptr, nullptr); + } + + template + static OptionTypeInfo AsCustomRawPtr(int offset, OptionVerificationType ovt, + OptionTypeFlags flags, + const SerializeFunc& serialize_func, + const EqualsFunc& equals_func) { + return OptionTypeInfo( + offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kRawPointer, + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto** pointer = static_cast(addr); + return T::CreateFromString(opts, value, pointer); + }, + serialize_func, equals_func); + } + bool IsEnabled(OptionTypeFlags otf) const { return (flags_ & otf) == otf; } + bool IsEditable(const ConfigOptions& opts) const { + if (opts.mutable_options_only) { + return IsMutable(); + } else { + return true; + } + } bool IsMutable() const { return IsEnabled(OptionTypeFlags::kMutable); } bool IsDeprecated() const { @@ -475,7 +583,12 @@ class OptionTypeInfo { bool IsStruct() const { return (type_ == OptionType::kStruct); } - bool IsConfigurable() const { return (type_ == OptionType::kConfigurable); } + bool IsConfigurable() const { + return (type_ == OptionType::kConfigurable || + type_ == OptionType::kCustomizable); + } + + bool IsCustomizable() const { return (type_ == OptionType::kCustomizable); } // Returns the underlying pointer for the type at base_addr // The value returned is the underlying "raw" pointer, offset from base. 
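To make the new helpers concrete, here is a hypothetical sketch (not part of the patch) of a type map combining OptionTypeInfo::Enum with the AsCustomSharedPtr helper added above. MyMode, MyPluginOptions and MyCache are invented names; the only thing AsCustomSharedPtr requires of MyCache is a static CreateFromString() with the signature used below.

#include <cstddef>
#include <memory>
#include <string>
#include <unordered_map>
#include "rocksdb/convenience.h"
#include "rocksdb/status.h"
#include "rocksdb/utilities/options_type.h"

using namespace ROCKSDB_NAMESPACE;

enum class MyMode { kFast, kSafe };

static std::unordered_map<std::string, MyMode> mode_string_map = {
    {"fast", MyMode::kFast}, {"safe", MyMode::kSafe}};

// Stand-in for a Customizable-style class: AsCustomSharedPtr only calls its
// static CreateFromString() to turn the option string into an object.
class MyCache {
 public:
  virtual ~MyCache() {}
  static Status CreateFromString(const ConfigOptions& /*opts*/,
                                 const std::string& /*value*/,
                                 std::shared_ptr<MyCache>* result) {
    result->reset(new MyCache());
    return Status::OK();
  }
};

struct MyPluginOptions {
  MyMode mode = MyMode::kFast;
  std::shared_ptr<MyCache> cache;
};

static std::unordered_map<std::string, OptionTypeInfo> my_plugin_type_map = {
    // "mode=fast" parses through the string-to-enum map; serialization and
    // comparison functions are generated by OptionTypeInfo::Enum.
    {"mode", OptionTypeInfo::Enum<MyMode>(offsetof(MyPluginOptions, mode),
                                          &mode_string_map)},
    // "cache=..." is forwarded to MyCache::CreateFromString and the result is
    // stored in the shared_ptr member at the given offset.
    {"cache", OptionTypeInfo::AsCustomSharedPtr<MyCache>(
                  offsetof(MyPluginOptions, cache),
                  OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
};

A map like this can then be handed to the ParseType/SerializeType/TypesAreEqual helpers declared just below.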
@@ -484,20 +597,20 @@ class OptionTypeInfo { if (base_addr == nullptr) { return nullptr; } - const auto opt_addr = reinterpret_cast(base_addr) + offset_; + const void* opt_addr = static_cast(base_addr) + offset_; if (IsUniquePtr()) { const std::unique_ptr* ptr = - reinterpret_cast*>(opt_addr); + static_cast*>(opt_addr); return ptr->get(); } else if (IsSharedPtr()) { const std::shared_ptr* ptr = - reinterpret_cast*>(opt_addr); + static_cast*>(opt_addr); return ptr->get(); } else if (IsRawPtr()) { - const T* const* ptr = reinterpret_cast(opt_addr); + const T* const* ptr = static_cast(opt_addr); return *ptr; } else { - return reinterpret_cast(opt_addr); + return static_cast(opt_addr); } } @@ -508,18 +621,18 @@ class OptionTypeInfo { if (base_addr == nullptr) { return nullptr; } - auto opt_addr = reinterpret_cast(base_addr) + offset_; + void* opt_addr = static_cast(base_addr) + offset_; if (IsUniquePtr()) { - std::unique_ptr* ptr = reinterpret_cast*>(opt_addr); + std::unique_ptr* ptr = static_cast*>(opt_addr); return ptr->get(); } else if (IsSharedPtr()) { - std::shared_ptr* ptr = reinterpret_cast*>(opt_addr); + std::shared_ptr* ptr = static_cast*>(opt_addr); return ptr->get(); } else if (IsRawPtr()) { - T** ptr = reinterpret_cast(opt_addr); + T** ptr = static_cast(opt_addr); return *ptr; } else { - return reinterpret_cast(opt_addr); + return static_cast(opt_addr); } } @@ -553,6 +666,37 @@ class OptionTypeInfo { const std::string& opt_name, const void* const this_ptr, const std::string& that_value) const; + // Parses the input opts_map according to the type_map for the opt_addr + // For each name-value pair in opts_map, find the corresponding name in + // type_map If the name is found: + // - set the corresponding value in opt_addr, returning the status on + // failure; + // If the name is not found: + // - If unused is specified, add the name-value to unused and continue + // - If ingore_unknown_options is false, return NotFound + // Returns OK if all options were either: + // - Successfully set + // - options were not found and ignore_unknown_options=true + // - options were not found and unused was specified + // Note that this method is much less sophisticated than the comparable + // Configurable::Configure methods. For example, on error, there is no + // attempt to return opt_addr to the initial state. Additionally, there + // is no effort to initialize (Configurable::PrepareOptions) the object + // on success. This method should typically only be used for simpler, + // standalone structures and not those that contain shared and embedded + // objects. + static Status ParseType( + const ConfigOptions& config_options, const std::string& opts_str, + const std::unordered_map& type_map, + void* opt_addr, + std::unordered_map* unused = nullptr); + static Status ParseType( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + const std::unordered_map& type_map, + void* opt_addr, + std::unordered_map* unused = nullptr); + // Parses the input value according to the map for the struct at opt_addr // struct_name is the name of the struct option as registered // opt_name is the name of the option being evaluated. 
This may @@ -561,7 +705,15 @@ class OptionTypeInfo { static Status ParseStruct( const ConfigOptions& config_options, const std::string& struct_name, const std::unordered_map* map, - const std::string& opt_name, const std::string& value, char* opt_addr); + const std::string& opt_name, const std::string& value, void* opt_addr); + + // Serializes the values from opt_addr using the rules in type_map. + // Returns the serialized form in result. + // Returns OK on success or non-OK if some option could not be serialized. + static Status SerializeType( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* opt_addr, std::string* value); // Serializes the input addr according to the map for the struct to value. // struct_name is the name of the struct option as registered @@ -570,7 +722,16 @@ class OptionTypeInfo { static Status SerializeStruct( const ConfigOptions& config_options, const std::string& struct_name, const std::unordered_map* map, - const std::string& opt_name, const char* opt_addr, std::string* value); + const std::string& opt_name, const void* opt_addr, std::string* value); + + // Compares the values in this_addr and that_addr using the rules in type_map. + // If the values are equal, returns true + // If the values are not equal, returns false and sets mismatch to the name + // of the first value that did not match. + static bool TypesAreEqual( + const ConfigOptions& config_options, + const std::unordered_map& map, + const void* this_addr, const void* that_addr, std::string* mismatch); // Compares the input offsets according to the map for the struct and returns // true if they are equivalent, false otherwise. @@ -580,8 +741,8 @@ class OptionTypeInfo { static bool StructsAreEqual( const ConfigOptions& config_options, const std::string& struct_name, const std::unordered_map* map, - const std::string& opt_name, const char* this_offset, - const char* that_offset, std::string* mismatch); + const std::string& opt_name, const void* this_offset, + const void* that_offset, std::string* mismatch); // Finds the entry for the opt_name in the opt_map, returning // nullptr if not found. @@ -607,7 +768,7 @@ class OptionTypeInfo { // @param opts The string in which to find the next token // @param delimiter The delimiter between tokens // @param start The position in opts to start looking for the token - // @parem ed Returns the end position in opts of the token + // @param ed Returns the end position in opts of the token // @param token Returns the token // @returns OK if a token was found // @return InvalidArgument if the braces mismatch @@ -660,6 +821,10 @@ Status ParseVector(const ConfigOptions& config_options, result->clear(); Status status; + // Turn off ignore_unknown_objects so we can tell if the returned + // object is valid or not. 
+ ConfigOptions copy = config_options; + copy.ignore_unsupported_options = false; for (size_t start = 0, end = 0; status.ok() && start < value.size() && end != std::string::npos; start = end + 1) { @@ -667,10 +832,14 @@ Status ParseVector(const ConfigOptions& config_options, status = OptionTypeInfo::NextToken(value, separator, start, &end, &token); if (status.ok()) { T elem; - status = elem_info.Parse(config_options, name, token, - reinterpret_cast(&elem)); + status = elem_info.Parse(copy, name, token, &elem); if (status.ok()) { result->emplace_back(elem); + } else if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + // If we were ignoring unsupported options and this one should be + // ignored, ignore it by setting the status to OK + status = Status::OK(); } } } @@ -739,7 +908,7 @@ Status SerializeVector(const ConfigOptions& config_options, // @param vec1,vec2 The vectors to compare. // @param mismatch If the vectors are not equivalent, mismatch will point to // the first -// element of the comparison tht did not match. +// element of the comparison that did not match. // @return true If vec1 and vec2 are "equal", false otherwise template bool VectorsAreEqual(const ConfigOptions& config_options, diff --git a/include/rocksdb/utilities/options_util.h b/include/rocksdb/utilities/options_util.h index 681b42cfdf9..064c087f05c 100644 --- a/include/rocksdb/utilities/options_util.h +++ b/include/rocksdb/utilities/options_util.h @@ -51,7 +51,7 @@ struct ConfigOptions; // BlockBasedTableOptions and making necessary changes. // // ignore_unknown_options can be set to true if you want to ignore options -// that are from a newer version of the db, esentially for forward +// that are from a newer version of the db, essentially for forward // compatibility. // // config_options contains a set of options that controls the processing @@ -66,7 +66,7 @@ struct ConfigOptions; // @return the function returns an OK status when it went successfully. If // the specified "dbpath" does not contain any option file, then a // Status::NotFound will be returned. A return value other than -// Status::OK or Status::NotFound indicates there're some error related +// Status::OK or Status::NotFound indicates there is some error related // to the options file itself. // // @see LoadOptionsFromFile diff --git a/include/rocksdb/utilities/sim_cache.h b/include/rocksdb/utilities/sim_cache.h index ba6f1d74894..17143916b30 100644 --- a/include/rocksdb/utilities/sim_cache.h +++ b/include/rocksdb/utilities/sim_cache.h @@ -25,7 +25,7 @@ class SimCache; // can help users tune their current block cache size, and determine how // efficient they are using the memory. // -// Since GetSimCapacity() returns the capacity for simulutation, it differs from +// Since GetSimCapacity() returns the capacity for simulation, it differs from // actual memory usage, which can be estimated as: // sim_capacity * entry_size / (entry_size + block_size), // where 76 <= entry_size <= 104, @@ -60,7 +60,7 @@ class SimCache : public Cache { // sets the maximum configured capacity of the simcache. When the new // capacity is less than the old capacity and the existing usage is // greater than new capacity, the implementation will purge old entries - // to fit new capapicty. + // to fit new capacity. 
virtual void SetSimCapacity(size_t capacity) = 0; // returns the lookup times of simcache diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index b785961f343..5cecc80c905 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -141,6 +141,11 @@ class StackableDB : public DB { import_options, metadata, handle); } + using DB::VerifyFileChecksums; + Status VerifyFileChecksums(const ReadOptions& read_opts) override { + return db_->VerifyFileChecksums(read_opts); + } + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } virtual Status VerifyChecksum(const ReadOptions& options) override { @@ -367,6 +372,24 @@ class StackableDB : public DB { using DB::EndBlockCacheTrace; Status EndBlockCacheTrace() override { return db_->EndBlockCacheTrace(); } + using DB::StartIOTrace; + Status StartIOTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartIOTrace(options, std::move(trace_writer)); + } + + using DB::EndIOTrace; + Status EndIOTrace() override { return db_->EndIOTrace(); } + + using DB::StartTrace; + Status StartTrace(const TraceOptions& options, + std::unique_ptr&& trace_writer) override { + return db_->StartTrace(options, std::move(trace_writer)); + } + + using DB::EndTrace; + Status EndTrace() override { return db_->EndTrace(); } + #endif // ROCKSDB_LITE virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index 6ebdbcc408b..dd7dd998ab3 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -24,6 +24,83 @@ using TransactionName = std::string; using TransactionID = uint64_t; +/* + class Endpoint allows to define prefix ranges. + + Prefix ranges are introduced below. + + == Basic Ranges == + Let's start from basic ranges. Key Comparator defines ordering of rowkeys. + Then, one can specify finite closed ranges by just providing rowkeys of their + endpoints: + + lower_endpoint <= X <= upper_endpoint + + However our goal is to provide a richer set of endpoints. Read on. + + == Lexicographic ordering == + A lexicographic (or dictionary) ordering satisfies these criteria: If there + are two keys in form + key_a = {prefix_a, suffix_a} + key_b = {prefix_b, suffix_b} + and + prefix_a < prefix_b + then + key_a < key_b. + + == Prefix ranges == + With lexicographic ordering, one may want to define ranges in form + + "prefix is $PREFIX" + + which translates to a range in form + + {$PREFIX, -infinity} < X < {$PREFIX, +infinity} + + where -infinity will compare less than any possible suffix, and +infinity + will compare as greater than any possible suffix. + + class Endpoint allows to define these kind of rangtes. + + == Notes == + BytewiseComparator and ReverseBytewiseComparator produce lexicographic + ordering. + + The row comparison function is able to compare key prefixes. If the data + domain includes keys A and B, then the comparison function is able to compare + equal-length prefixes: + + min_len= min(byte_length(A), byte_length(B)); + cmp(Slice(A, min_len), Slice(B, min_len)); // this call is valid + + == Other options == + As far as MyRocks is concerned, the alternative to prefix ranges would be to + support both open (non-inclusive) and closed (inclusive) range endpoints. +*/ + +class Endpoint { + public: + Slice slice; + + /* + true : the key has a "+infinity" suffix. 
A suffix that would compare as + greater than any other suffix + false : otherwise + */ + bool inf_suffix; + + explicit Endpoint(const Slice& slice_arg, bool inf_suffix_arg = false) + : slice(slice_arg), inf_suffix(inf_suffix_arg) {} + + explicit Endpoint(const char* s, bool inf_suffix_arg = false) + : slice(s), inf_suffix(inf_suffix_arg) {} + + Endpoint(const char* s, size_t size, bool inf_suffix_arg = false) + : slice(s, size), inf_suffix(inf_suffix_arg) {} + + Endpoint() : inf_suffix(false) {} +}; + // Provides notification to the caller of SetSnapshotOnNextOperation when // the actual snapshot gets created class TransactionNotifier { @@ -277,6 +354,12 @@ class Transaction { } } + // Get a range lock on [start_endpoint; end_endpoint]. + virtual Status GetRangeLock(ColumnFamilyHandle*, const Endpoint&, + const Endpoint&) { + return Status::NotSupported(); + } + virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, std::string* value, bool exclusive = true, const bool do_validate = true) = 0; diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 8967b7eefab..265d4b79a09 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -31,6 +31,102 @@ enum TxnDBWritePolicy { const uint32_t kInitialMaxDeadlocks = 5; +class LockManager; +struct RangeLockInfo; + +// A lock manager handle +// The workflow is as follows: +// * Use a factory method (like NewRangeLockManager()) to create a lock +// manager and get its handle. +// * A Handle for a particular kind of lock manager will have extra +// methods and parameters to control the lock manager +// * Pass the handle to RocksDB in TransactionDBOptions::lock_mgr_handle. It +// will be used to perform locking. +class LockManagerHandle { + public: + // PessimisticTransactionDB will call this to get the Lock Manager it's going + // to use. + virtual LockManager* getLockManager() = 0; + + virtual ~LockManagerHandle() {} +}; + +// Same as class Endpoint, but use std::string to manage the buffer allocation +struct EndpointWithString { + std::string slice; + bool inf_suffix; +}; + +struct RangeDeadlockInfo { + TransactionID m_txn_id; + uint32_t m_cf_id; + bool m_exclusive; + + EndpointWithString m_start; + EndpointWithString m_end; +}; + +struct RangeDeadlockPath { + std::vector path; + bool limit_exceeded; + int64_t deadlock_time; + + explicit RangeDeadlockPath(std::vector path_entry, + const int64_t& dl_time) + : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} + + // empty path, limit exceeded constructor and default constructor + explicit RangeDeadlockPath(const int64_t& dl_time = 0, bool limit = false) + : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} + + bool empty() { return path.empty() && !limit_exceeded; } +}; + +// A handle to control RangeLockManager (Range-based lock manager) from outside +// RocksDB +class RangeLockManagerHandle : public LockManagerHandle { + public: + // Set total amount of lock memory to use. 
+ // + // @return 0 Ok + // @return EDOM Failed to set because currently using more memory than + // specified + virtual int SetMaxLockMemory(size_t max_lock_memory) = 0; + virtual size_t GetMaxLockMemory() = 0; + + using RangeLockStatus = + std::unordered_multimap; + + virtual RangeLockStatus GetRangeLockStatusData() = 0; + + class Counters { + public: + // Number of times lock escalation was triggered (for all column families) + uint64_t escalation_count; + + // How much memory is currently used for locks (total for all column + // families) + uint64_t current_lock_memory; + }; + + // Get the current counter values + virtual Counters GetStatus() = 0; + + // Functions for range-based Deadlock reporting. + virtual std::vector GetRangeDeadlockInfoBuffer() = 0; + virtual void SetRangeDeadlockInfoBufferSize(uint32_t target_size) = 0; + + virtual ~RangeLockManagerHandle() {} +}; + +// A factory function to create a Range Lock Manager. The created object should +// be: +// 1. Passed in TransactionDBOptions::lock_mgr_handle to open the database in +// range-locking mode +// 2. Used to control the lock manager when the DB is already open. +RangeLockManagerHandle* NewRangeLockManager( + std::shared_ptr mutex_factory); + struct TransactionDBOptions { // Specifies the maximum number of keys that can be locked at the same time // per column family. @@ -92,9 +188,13 @@ struct TransactionDBOptions { // for the special way that myrocks uses this operands. bool rollback_merge_operands = false; + // nullptr means use default lock manager. + // Other value means the user provides a custom lock manager. + std::shared_ptr lock_mgr_handle; + // If true, the TransactionDB implementation might skip concurrency control // unless it is overridden by TransactionOptions or - // TransactionDBWriteOptimizations. This can be used in conjuction with + // TransactionDBWriteOptimizations. This can be used in conjunction with // DBOptions::unordered_write when the TransactionDB is used solely for write // ordering rather than concurrency control. bool skip_concurrency_control = false; @@ -202,6 +302,13 @@ struct KeyLockInfo { bool exclusive; }; +struct RangeLockInfo { + EndpointWithString start; + EndpointWithString end; + std::vector ids; + bool exclusive; +}; + struct DeadlockInfo { TransactionID m_txn_id; uint32_t m_cf_id; @@ -237,6 +344,17 @@ class TransactionDB : public StackableDB { // falls back to the un-optimized version of ::Write return Write(opts, updates); } + // Transactional `DeleteRange()` is not yet supported. + // However, users who know their deleted range does not conflict with + // anything can still use it via the `Write()` API. In all cases, the + // `Write()` overload specifying `TransactionDBWriteOptimizations` must be + // used and `skip_concurrency_control` must be set. When using either + // WRITE_PREPARED or WRITE_UNPREPARED , `skip_duplicate_key_check` must + // additionally be set. + virtual Status DeleteRange(const WriteOptions&, ColumnFamilyHandle*, + const Slice&, const Slice&) override { + return Status::NotSupported(); + } // Open a TransactionDB similar to DB::Open(). // Internally call PrepareWrap() and WrapDB() // If the return status is not ok, then dbptr is set to nullptr. 
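The workflow described in the comments above (create a range lock manager handle, pass it via TransactionDBOptions::lock_mgr_handle, then take range locks inside a transaction) could look roughly like the following sketch; the database path, key values, and function name are illustrative assumptions:

#include <memory>
#include <string>
#include "rocksdb/utilities/transaction_db.h"

ROCKSDB_NAMESPACE::Status OpenWithRangeLocking(const std::string& path) {
  using namespace ROCKSDB_NAMESPACE;

  // nullptr selects the default mutex factory.
  std::shared_ptr<RangeLockManagerHandle> range_lock_mgr(
      NewRangeLockManager(nullptr));

  Options options;
  options.create_if_missing = true;
  TransactionDBOptions txn_db_options;
  txn_db_options.lock_mgr_handle = range_lock_mgr;  // enable range locking

  TransactionDB* db = nullptr;
  Status s = TransactionDB::Open(options, txn_db_options, path, &db);
  if (!s.ok()) {
    return s;
  }

  Transaction* txn = db->BeginTransaction(WriteOptions());
  // Lock the inclusive range ["a", "z"] in the default column family.
  s = txn->GetRangeLock(db->DefaultColumnFamily(), Endpoint("a"),
                        Endpoint("z"));
  if (s.ok()) {
    s = txn->Commit();
  }
  delete txn;
  delete db;
  return s;
}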
@@ -296,6 +414,7 @@ class TransactionDB : public StackableDB { // The mapping is column family id -> KeyLockInfo virtual std::unordered_multimap GetLockStatusData() = 0; + virtual std::vector GetDeadlockInfoBuffer() = 0; virtual void SetDeadlockInfoBufferSize(uint32_t target_size) = 0; diff --git a/include/rocksdb/utilities/transaction_db_mutex.h b/include/rocksdb/utilities/transaction_db_mutex.h index 96a42adf8c5..13d899c3226 100644 --- a/include/rocksdb/utilities/transaction_db_mutex.h +++ b/include/rocksdb/utilities/transaction_db_mutex.h @@ -61,7 +61,7 @@ class TransactionDBCondVar { // // Returns OK if notified. // Returns TimedOut if timeout is reached. - // Returns other status if TransactionDB should otherwis stop waiting and + // Returns other status if TransactionDB should otherwise stop waiting and // fail the operation. // May return OK spuriously even if not notified. virtual Status WaitFor(std::shared_ptr mutex, diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 424aa158239..65feaa7b8ae 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -40,12 +40,13 @@ enum WriteType { kDeleteRangeRecord, kLogDataRecord, kXIDRecord, + kUnknownRecord, }; // an entry for Put, Merge, Delete, or SingleDelete entry for write batches. // Used in WBWIIterator. struct WriteEntry { - WriteType type; + WriteType type = kUnknownRecord; Slice key; Slice value; }; @@ -168,7 +169,7 @@ class WriteBatchWithIndex : public WriteBatchBase { // returned iterator will also delete the base_iterator. // // Updating write batch with the current key of the iterator is not safe. - // We strongly recommand users not to do it. It will invalidate the current + // We strongly recommend users not to do it. It will invalidate the current // key() and value() of the iterator. This invalidation happens even before // the write batch update finishes. The state may recover after Next() is // called. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index c2631d0722b..ebdfb7468f1 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -4,9 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + #define ROCKSDB_MAJOR 6 -#define ROCKSDB_MINOR 14 -#define ROCKSDB_PATCH 6 +#define ROCKSDB_MINOR 22 +#define ROCKSDB_PATCH 1 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these @@ -14,3 +19,23 @@ #define __ROCKSDB_MAJOR__ ROCKSDB_MAJOR #define __ROCKSDB_MINOR__ ROCKSDB_MINOR #define __ROCKSDB_PATCH__ ROCKSDB_PATCH + +namespace ROCKSDB_NAMESPACE { +// Returns a set of properties indicating how/when/where this version of RocksDB +// was created. +const std::unordered_map& GetRocksBuildProperties(); + +// Returns the current version of RocksDB as a string (e.g. "6.16.0"). +// If with_patch is true, the patch is included (6.16.x). +// Otherwise, only major and minor version is included (6.16) +std::string GetRocksVersionAsString(bool with_patch = true); + +// Gets the set of build properties (@see GetRocksBuildProperties) into a +// string. Properties are returned one-per-line, with the first line being: +// " from RocksDB . +// If verbose is true, the full set of properties is +// printed. 
If verbose is false, only the version information (@see +// GetRocksVersionString) is printed. +std::string GetRocksBuildInfoAsString(const std::string& program, + bool verbose = false); +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 51fd4d8ac80..d47c435bf4f 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -62,6 +62,11 @@ class WriteBatch : public WriteBatchBase { public: explicit WriteBatch(size_t reserved_bytes = 0, size_t max_bytes = 0); explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz); + // `protection_bytes_per_key` is the number of bytes used to store + // protection information for each key entry. Currently supported values are + // zero (disabled) and eight. + explicit WriteBatch(size_t reserved_bytes, size_t max_bytes, size_t ts_sz, + size_t protection_bytes_per_key); ~WriteBatch() override; using WriteBatchBase::Put; @@ -307,10 +312,10 @@ class WriteBatch : public WriteBatchBase { // Returns true if MarkEndPrepare will be called during Iterate bool HasEndPrepare() const; - // Returns trie if MarkCommit will be called during Iterate + // Returns true if MarkCommit will be called during Iterate bool HasCommit() const; - // Returns trie if MarkRollback will be called during Iterate + // Returns true if MarkRollback will be called during Iterate bool HasRollback() const; // Assign timestamp to write batch @@ -338,6 +343,9 @@ class WriteBatch : public WriteBatchBase { void SetMaxBytes(size_t max_bytes) override { max_bytes_ = max_bytes; } + struct ProtectionInfo; + size_t GetProtectionBytesPerKey() const; + private: friend class WriteBatchInternal; friend class LocalSavePoint; @@ -367,11 +375,11 @@ class WriteBatch : public WriteBatchBase { // more details. bool is_latest_persistent_state_ = false; + std::unique_ptr prot_info_; + protected: std::string rep_; // See comment in write_batch.cc for the format of rep_ const size_t timestamp_size_; - - // Intentionally copyable }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/write_buffer_manager.h b/include/rocksdb/write_buffer_manager.h index ae1c98cafb2..67aef7f8fe7 100644 --- a/include/rocksdb/write_buffer_manager.h +++ b/include/rocksdb/write_buffer_manager.h @@ -13,46 +13,96 @@ #pragma once #include +#include #include +#include +#include + #include "rocksdb/cache.h" namespace ROCKSDB_NAMESPACE { +// Interface to block and signal DB instances. +// Each DB instance contains ptr to StallInterface. +class StallInterface { + public: + virtual ~StallInterface() {} + + virtual void Block() = 0; + + virtual void Signal() = 0; +}; + class WriteBufferManager { public: - // _buffer_size = 0 indicates no limit. Memory won't be capped. + // Parameters: + // _buffer_size: _buffer_size = 0 indicates no limit. Memory won't be capped. // memory_usage() won't be valid and ShouldFlush() will always return true. - // if `cache` is provided, we'll put dummy entries in the cache and cost - // the memory allocated to the cache. It can be used even if _buffer_size = 0. + // + // cache_: if `cache` is provided, we'll put dummy entries in the cache and + // cost the memory allocated to the cache. It can be used even if _buffer_size + // = 0. + // + // allow_stall: if set true, it will enable stalling of writes when + // memory_usage() exceeds buffer_size. It will wait for flush to complete and + // memory usage to drop down. 
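As a usage sketch of the shared write-buffer limit and the new allow_stall flag documented above (the sizes and the helper name are arbitrary assumptions), a single WriteBufferManager can be shared by several DBs, charge memtable memory to a block cache, and stall writers once the limit is hit:

#include <memory>
#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/write_buffer_manager.h"

void ConfigureSharedWriteBuffer(ROCKSDB_NAMESPACE::Options* options) {
  auto cache = ROCKSDB_NAMESPACE::NewLRUCache(1ULL << 30);  // 1 GiB block cache
  auto wbm = std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(
      512ULL << 20 /* 512 MiB across all memtables */, cache,
      /*allow_stall=*/true);
  options->write_buffer_manager = wbm;
  // The limit can be adjusted at runtime; per SetBufferSize() below, lowering
  // it also re-checks whether an active stall can be ended.
  wbm->SetBufferSize(256ULL << 20);
}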
explicit WriteBufferManager(size_t _buffer_size, - std::shared_ptr<Cache> cache = {}); + std::shared_ptr<Cache> cache = {}, + bool allow_stall = false); // No copying allowed WriteBufferManager(const WriteBufferManager&) = delete; WriteBufferManager& operator=(const WriteBufferManager&) = delete; ~WriteBufferManager(); - bool enabled() const { return buffer_size_ != 0; } + // Returns true if buffer_limit is passed to limit the total memory usage and + // is greater than 0. + bool enabled() const { return buffer_size() > 0; } + // Returns true if pointer to cache is passed. bool cost_to_cache() const { return cache_rep_ != nullptr; } + // Returns the total memory used by memtables. // Only valid if enabled() size_t memory_usage() const { return memory_used_.load(std::memory_order_relaxed); } + + // Returns the total memory used by active memtables. size_t mutable_memtable_memory_usage() const { return memory_active_.load(std::memory_order_relaxed); } - size_t buffer_size() const { return buffer_size_; } + + size_t dummy_entries_in_cache_usage() const { + return dummy_size_.load(std::memory_order_relaxed); + } + + // Returns the buffer_size. + size_t buffer_size() const { + return buffer_size_.load(std::memory_order_relaxed); + } + + void SetBufferSize(size_t new_size) { + buffer_size_.store(new_size, std::memory_order_relaxed); + mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed); + // Check if stall is active and can be ended. + if (allow_stall_) { + EndWriteStall(); + } + } + + // Below functions should be called by RocksDB internally. // Should only be called from write thread bool ShouldFlush() const { if (enabled()) { - if (mutable_memtable_memory_usage() > mutable_limit_) { + if (mutable_memtable_memory_usage() > + mutable_limit_.load(std::memory_order_relaxed)) { return true; } - if (memory_usage() >= buffer_size_ && - mutable_memtable_memory_usage() >= buffer_size_ / 2) { + size_t local_size = buffer_size(); + if (memory_usage() >= local_size && + mutable_memtable_memory_usage() >= local_size / 2) { // If the memory exceeds the buffer size, we trigger more aggressive // flush. But if already more than half memory is being flushed, // triggering more flush may not help. We will hold it instead. @@ -62,39 +112,65 @@ class WriteBufferManager { return false; } - void ReserveMem(size_t mem) { - if (cache_rep_ != nullptr) { - ReserveMemWithCache(mem); - } else if (enabled()) { - memory_used_.fetch_add(mem, std::memory_order_relaxed); - } - if (enabled()) { - memory_active_.fetch_add(mem, std::memory_order_relaxed); + // Returns true if total memory usage exceeded buffer_size. + // We stall the writes until memory_usage drops below buffer_size. When the + // function returns true, all writer threads (including the one checking this + // condition) across all DBs will be stalled. Stalling is allowed only if the + // user passes allow_stall = true during WriteBufferManager instance creation. + // + // Should only be called by RocksDB internally. + bool ShouldStall() { + if (allow_stall_ && enabled()) { + if (IsStallActive()) { + return true; + } + if (IsStallThresholdExceeded()) { + stall_active_.store(true, std::memory_order_relaxed); + return true; + } } + return false; } + + // Returns true if stall is active. + bool IsStallActive() const { + return stall_active_.load(std::memory_order_relaxed); + } + + // Returns true if the stalling condition is met.
+ bool IsStallThresholdExceeded() { return memory_usage() >= buffer_size_; } + + void ReserveMem(size_t mem); + // We are in the process of freeing `mem` bytes, so it is not considered // when checking the soft limit. - void ScheduleFreeMem(size_t mem) { - if (enabled()) { - memory_active_.fetch_sub(mem, std::memory_order_relaxed); - } - } - void FreeMem(size_t mem) { - if (cache_rep_ != nullptr) { - FreeMemWithCache(mem); - } else if (enabled()) { - memory_used_.fetch_sub(mem, std::memory_order_relaxed); - } - } + void ScheduleFreeMem(size_t mem); + + void FreeMem(size_t mem); + + // Add the DB instance to the queue and block the DB. + // Should only be called by RocksDB internally. + void BeginWriteStall(StallInterface* wbm_stall); + + // Remove DB instances from queue and signal them to continue. + void EndWriteStall(); + + void RemoveDBFromQueue(StallInterface* wbm_stall); private: - const size_t buffer_size_; - const size_t mutable_limit_; + std::atomic buffer_size_; + std::atomic mutable_limit_; std::atomic memory_used_; // Memory that hasn't been scheduled to free. std::atomic memory_active_; + std::atomic dummy_size_; struct CacheRep; std::unique_ptr cache_rep_; + std::list queue_; + // Protects the queue_ + std::mutex mu_; + bool allow_stall_; + std::atomic stall_active_; void ReserveMemWithCache(size_t mem); void FreeMemWithCache(size_t mem); diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index ebc1c197092..3ec6099fc26 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -13,6 +13,7 @@ set(JNI_NATIVE_SOURCES rocksjni/cassandra_value_operator.cc rocksjni/checkpoint.cc rocksjni/clock_cache.cc + rocksjni/cache.cc rocksjni/columnfamilyhandle.cc rocksjni/compaction_filter.cc rocksjni/compaction_filter_factory.cc @@ -30,6 +31,8 @@ set(JNI_NATIVE_SOURCES rocksjni/config_options.cc rocksjni/env.cc rocksjni/env_options.cc + rocksjni/event_listener.cc + rocksjni/event_listener_jnicallback.cc rocksjni/filter.cc rocksjni/ingest_external_file_options.cc rocksjni/iterator.cc @@ -63,6 +66,7 @@ set(JNI_NATIVE_SOURCES rocksjni/table.cc rocksjni/table_filter.cc rocksjni/table_filter_jnicallback.cc + rocksjni/testable_event_listener.cc rocksjni/thread_status.cc rocksjni/trace_writer.cc rocksjni/trace_writer_jnicallback.cc @@ -87,6 +91,7 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/AbstractCompactionFilter.java src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java src/main/java/org/rocksdb/AbstractComparator.java + src/main/java/org/rocksdb/AbstractEventListener.java src/main/java/org/rocksdb/AbstractImmutableNativeReference.java src/main/java/org/rocksdb/AbstractMutableOptions.java src/main/java/org/rocksdb/AbstractNativeReference.java @@ -100,6 +105,7 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/AccessHint.java src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java + src/main/java/org/rocksdb/BackgroundErrorReason.java src/main/java/org/rocksdb/BackupableDBOptions.java src/main/java/org/rocksdb/BackupEngine.java src/main/java/org/rocksdb/BackupInfo.java @@ -140,8 +146,13 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/EncodingType.java src/main/java/org/rocksdb/Env.java src/main/java/org/rocksdb/EnvOptions.java + src/main/java/org/rocksdb/EventListener.java src/main/java/org/rocksdb/Experimental.java + src/main/java/org/rocksdb/ExternalFileIngestionInfo.java src/main/java/org/rocksdb/Filter.java + src/main/java/org/rocksdb/FileOperationInfo.java + 
src/main/java/org/rocksdb/FlushJobInfo.java + src/main/java/org/rocksdb/FlushReason.java src/main/java/org/rocksdb/FlushOptions.java src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java src/main/java/org/rocksdb/HashSkipListMemTableConfig.java @@ -163,6 +174,7 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/MemoryUsageType.java src/main/java/org/rocksdb/MemoryUtil.java src/main/java/org/rocksdb/MemTableConfig.java + src/main/java/org/rocksdb/MemTableInfo.java src/main/java/org/rocksdb/MergeOperator.java src/main/java/org/rocksdb/MutableColumnFamilyOptions.java src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java @@ -218,6 +230,10 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/StatsLevel.java src/main/java/org/rocksdb/Status.java src/main/java/org/rocksdb/StringAppendOperator.java + src/main/java/org/rocksdb/TableFileCreationBriefInfo.java + src/main/java/org/rocksdb/TableFileCreationInfo.java + src/main/java/org/rocksdb/TableFileCreationReason.java + src/main/java/org/rocksdb/TableFileDeletionInfo.java src/main/java/org/rocksdb/TableFilter.java src/main/java/org/rocksdb/TableProperties.java src/main/java/org/rocksdb/TableFormatConfig.java @@ -247,6 +263,8 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/WriteBatchWithIndex.java src/main/java/org/rocksdb/WriteOptions.java src/main/java/org/rocksdb/WriteBufferManager.java + src/main/java/org/rocksdb/WriteStallCondition.java + src/main/java/org/rocksdb/WriteStallInfo.java src/main/java/org/rocksdb/util/ByteUtil.java src/main/java/org/rocksdb/util/BytewiseComparator.java src/main/java/org/rocksdb/util/Environment.java @@ -267,6 +285,7 @@ set(JAVA_TEST_CLASSES src/test/java/org/rocksdb/WriteBatchTest.java src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java src/test/java/org/rocksdb/util/WriteBatchGetter.java + src/test/java/org/rocksdb/test/TestableEventListener.java ) include(FindJava) @@ -391,6 +410,7 @@ if(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" org.rocksdb.AbstractCompactionFilter org.rocksdb.AbstractCompactionFilterFactory org.rocksdb.AbstractComparator + org.rocksdb.AbstractEventListener org.rocksdb.AbstractImmutableNativeReference org.rocksdb.AbstractNativeReference org.rocksdb.AbstractRocksIterator @@ -407,6 +427,7 @@ if(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" org.rocksdb.CassandraValueMergeOperator org.rocksdb.Checkpoint org.rocksdb.ClockCache + org.rocksdb.Cache org.rocksdb.ColumnFamilyHandle org.rocksdb.ColumnFamilyOptions org.rocksdb.CompactionJobInfo @@ -417,6 +438,8 @@ if(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" org.rocksdb.CompactRangeOptions org.rocksdb.ComparatorOptions org.rocksdb.CompressionOptions + org.rocksdb.ConcurrentTaskLimiterImpl + org.rocksdb.ConfigOptions org.rocksdb.DBOptions org.rocksdb.DirectSlice org.rocksdb.Env @@ -485,6 +508,7 @@ if(${CMAKE_VERSION} VERSION_LESS "3.11.4" OR (${Java_VERSION_MINOR} STREQUAL "7" org.rocksdb.WriteBatchTest org.rocksdb.WriteBatchTestInternalHelper org.rocksdb.WriteBufferManager + org.rocksdb.test.TestableEventListener ) create_javah( diff --git a/java/Makefile b/java/Makefile index c391a9bd235..1c702371222 100644 --- a/java/Makefile +++ b/java/Makefile @@ -2,6 +2,7 @@ NATIVE_JAVA_CLASSES = \ org.rocksdb.AbstractCompactionFilter\ org.rocksdb.AbstractCompactionFilterFactory\ org.rocksdb.AbstractComparator\ + org.rocksdb.AbstractEventListener\ org.rocksdb.AbstractSlice\ org.rocksdb.AbstractTableFilter\ 
org.rocksdb.AbstractTraceWriter\ @@ -13,6 +14,7 @@ NATIVE_JAVA_CLASSES = \ org.rocksdb.BloomFilter\ org.rocksdb.Checkpoint\ org.rocksdb.ClockCache\ + org.rocksdb.Cache\ org.rocksdb.CassandraCompactionFilter\ org.rocksdb.CassandraValueMergeOperator\ org.rocksdb.ColumnFamilyHandle\ @@ -87,7 +89,9 @@ NATIVE_JAVA_CLASSES = \ org.rocksdb.WriteBufferManager\ org.rocksdb.WBWIRocksIterator -NATIVE_JAVA_TEST_CLASSES = org.rocksdb.RocksDBExceptionTest\ +NATIVE_JAVA_TEST_CLASSES = \ + org.rocksdb.RocksDBExceptionTest\ + org.rocksdb.test.TestableEventListener\ org.rocksdb.NativeComparatorWrapperTest.NativeStringComparatorWrapper\ org.rocksdb.WriteBatchTest\ org.rocksdb.WriteBatchTestInternalHelper @@ -98,10 +102,7 @@ ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" ../include/rocksdb/version.h NATIVE_INCLUDE = ./include ARCH := $(shell getconf LONG_BIT) -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar -ifeq ($(PLATFORM), OS_MACOSX) -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -endif +SHA256_CMD ?= sha256sum JAVA_TESTS = \ org.rocksdb.BackupableDBOptionsTest\ @@ -129,6 +130,7 @@ JAVA_TESTS = \ org.rocksdb.DirectSliceTest\ org.rocksdb.util.EnvironmentTest\ org.rocksdb.EnvOptionsTest\ + org.rocksdb.EventListenerTest\ org.rocksdb.HdfsEnvTest\ org.rocksdb.IngestExternalFileOptionsTest\ org.rocksdb.util.IntComparatorTest\ @@ -205,12 +207,27 @@ SAMPLES_OUTPUT = samples/target SAMPLES_MAIN_CLASSES = $(SAMPLES_OUTPUT)/classes JAVA_TEST_LIBDIR = test-libs -JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)/junit-4.12.jar -JAVA_HAMCR_JAR = $(JAVA_TEST_LIBDIR)/hamcrest-core-1.3.jar -JAVA_MOCKITO_JAR = $(JAVA_TEST_LIBDIR)/mockito-all-1.10.19.jar -JAVA_CGLIB_JAR = $(JAVA_TEST_LIBDIR)/cglib-2.2.2.jar -JAVA_ASSERTJ_JAR = $(JAVA_TEST_LIBDIR)/assertj-core-1.7.1.jar -JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_MOCKITO_JAR):$(JAVA_CGLIB_JAR):$(JAVA_ASSERTJ_JAR) +JAVA_JUNIT_VER = 4.13.1 +JAVA_JUNIT_SHA256 = c30719db974d6452793fe191b3638a5777005485bae145924044530ffa5f6122 +JAVA_JUNIT_JAR = junit-$(JAVA_JUNIT_VER).jar +JAVA_JUNIT_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_JUNIT_JAR) +JAVA_HAMCREST_VER = 2.2 +JAVA_HAMCREST_SHA256 = 5e62846a89f05cd78cd9c1a553f340d002458380c320455dd1f8fc5497a8a1c1 +JAVA_HAMCREST_JAR = hamcrest-$(JAVA_HAMCREST_VER).jar +JAVA_HAMCREST_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_HAMCREST_JAR) +JAVA_MOCKITO_VER = 1.10.19 +JAVA_MOCKITO_SHA256 = d1a7a7ef14b3db5c0fc3e0a63a81b374b510afe85add9f7984b97911f4c70605 +JAVA_MOCKITO_JAR = mockito-all-$(JAVA_MOCKITO_VER).jar +JAVA_MOCKITO_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_MOCKITO_JAR) +JAVA_CGLIB_VER = 3.3.0 +JAVA_CGLIB_SHA256 = 9fe0c26d7464140ccdfe019ac687be1fb906122b508ab54beb810db0f09a9212 +JAVA_CGLIB_JAR = cglib-$(JAVA_CGLIB_VER).jar +JAVA_CGLIB_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_CGLIB_JAR) +JAVA_ASSERTJ_VER = 2.9.0 +JAVA_ASSERTJ_SHA256 = 5e88ea3ecbe3c48aa1346fec76c84979fa9c8d22499f11479011691230e8babf +JAVA_ASSERTJ_JAR = assertj-core-$(JAVA_ASSERTJ_VER).jar +JAVA_ASSERTJ_JAR_PATH = $(JAVA_TEST_LIBDIR)/$(JAVA_ASSERTJ_JAR) +JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR_PATH):$(JAVA_HAMCREST_JAR_PATH):$(JAVA_MOCKITO_JAR_PATH):$(JAVA_CGLIB_JAR_PATH):$(JAVA_ASSERTJ_JAR_PATH) MVN_LOCAL = ~/.m2/repository @@ -294,13 +311,70 @@ optimistic_transaction_sample: java java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) OptimisticTransactionSample /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni -resolve_test_deps: - test -d "$(JAVA_TEST_LIBDIR)" 
|| mkdir -p "$(JAVA_TEST_LIBDIR)" - test -s "$(JAVA_JUNIT_JAR)" || cp $(MVN_LOCAL)/junit/junit/4.12/junit-4.12.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output $(JAVA_JUNIT_JAR) --location $(DEPS_URL)/junit-4.12.jar - test -s "$(JAVA_HAMCR_JAR)" || cp $(MVN_LOCAL)/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output $(JAVA_HAMCR_JAR) --location $(DEPS_URL)/hamcrest-core-1.3.jar - test -s "$(JAVA_MOCKITO_JAR)" || cp $(MVN_LOCAL)/org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_MOCKITO_JAR)" --location $(DEPS_URL)/mockito-all-1.10.19.jar - test -s "$(JAVA_CGLIB_JAR)" || cp $(MVN_LOCAL)/cglib/cglib/2.2.2/cglib-2.2.2.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_CGLIB_JAR)" --location $(DEPS_URL)/cglib-2.2.2.jar - test -s "$(JAVA_ASSERTJ_JAR)" || cp $(MVN_LOCAL)/org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar $(JAVA_TEST_LIBDIR) || curl --fail --insecure --output "$(JAVA_ASSERTJ_JAR)" --location $(DEPS_URL)/assertj-core-1.7.1.jar +$(JAVA_TEST_LIBDIR): + mkdir -p "$(JAVA_TEST_LIBDIR)" + +$(JAVA_JUNIT_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/junit/junit/$(JAVA_JUNIT_VER)/$(JAVA_JUNIT_JAR))) + cp -v $(MVN_LOCAL)/junit/junit/$(JAVA_JUNIT_VER)/$(JAVA_JUNIT_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output $(JAVA_JUNIT_JAR_PATH) --location $(DEPS_URL)/$(JAVA_JUNIT_JAR) + JAVA_JUNIT_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_JUNIT_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_JUNIT_SHA256)" != "$$JAVA_JUNIT_SHA256_ACTUAL" ]; then \ + echo $(JAVA_JUNIT_JAR_PATH) checksum mismatch, expected=\"$(JAVA_JUNIT_SHA256)\" actual=\"$$JAVA_JUNIT_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_HAMCREST_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/org/hamcrest/hamcrest/$(JAVA_HAMCREST_VER)/$(JAVA_HAMCREST_JAR))) + cp -v $(MVN_LOCAL)/org/hamcrest/hamcrest/$(JAVA_HAMCREST_VER)/$(JAVA_HAMCREST_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output $(JAVA_HAMCREST_JAR_PATH) --location $(DEPS_URL)/$(JAVA_HAMCREST_JAR) + JAVA_HAMCREST_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_HAMCREST_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_HAMCREST_SHA256)" != "$$JAVA_HAMCREST_SHA256_ACTUAL" ]; then \ + echo $(JAVA_HAMCREST_JAR_PATH) checksum mismatch, expected=\"$(JAVA_HAMCREST_SHA256)\" actual=\"$$JAVA_HAMCREST_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_MOCKITO_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/org/mockito/mockito-all/$(JAVA_MOCKITO_VER)/$(JAVA_MOCKITO_JAR))) + cp -v $(MVN_LOCAL)/org/mockito/mockito-all/$(JAVA_MOCKITO_VER)/$(JAVA_MOCKITO_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output "$(JAVA_MOCKITO_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_MOCKITO_JAR) + JAVA_MOCKITO_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_MOCKITO_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_MOCKITO_SHA256)" != "$$JAVA_MOCKITO_SHA256_ACTUAL" ]; then \ + echo $(JAVA_MOCKITO_JAR_PATH) checksum mismatch, expected=\"$(JAVA_MOCKITO_SHA256)\" actual=\"$$JAVA_MOCKITO_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_CGLIB_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/cglib/cglib/$(JAVA_CGLIB_VER)/$(JAVA_CGLIB_JAR))) + cp -v $(MVN_LOCAL)/cglib/cglib/$(JAVA_CGLIB_VER)/$(JAVA_CGLIB_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output "$(JAVA_CGLIB_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_CGLIB_JAR) + JAVA_CGLIB_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_CGLIB_JAR_PATH) | 
cut -d ' ' -f 1`; \ + if [ "$(JAVA_CGLIB_SHA256)" != "$$JAVA_CGLIB_SHA256_ACTUAL" ]; then \ + echo $(JAVA_CGLIB_JAR_PATH) checksum mismatch, expected=\"$(JAVA_CGLIB_SHA256)\" actual=\"$$JAVA_CGLIB_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +$(JAVA_ASSERTJ_JAR_PATH): $(JAVA_TEST_LIBDIR) +ifneq (,$(wildcard $(MVN_LOCAL)/org/assertj/assertj-core/$(JAVA_ASSERTJ_VER)/$(JAVA_ASSERTJ_JAR))) + cp -v $(MVN_LOCAL)/org/assertj/assertj-core/$(JAVA_ASSERTJ_VER)/$(JAVA_ASSERTJ_JAR) $(JAVA_TEST_LIBDIR) +else + curl --fail --insecure --output "$(JAVA_ASSERTJ_JAR_PATH)" --location $(DEPS_URL)/$(JAVA_ASSERTJ_JAR) + JAVA_ASSERTJ_SHA256_ACTUAL=`$(SHA256_CMD) $(JAVA_ASSERTJ_JAR_PATH) | cut -d ' ' -f 1`; \ + if [ "$(JAVA_ASSERTJ_SHA256)" != "$$JAVA_ASSERTJ_SHA256_ACTUAL" ]; then \ + echo $(JAVA_ASSERTJ_JAR_PATH) checksum mismatch, expected=\"$(JAVA_ASSERTJ_SHA256)\" actual=\"$$JAVA_ASSERTJ_SHA256_ACTUAL\"; \ + exit 1; \ + fi +endif + +resolve_test_deps: $(JAVA_JUNIT_JAR_PATH) $(JAVA_HAMCREST_JAR_PATH) $(JAVA_MOCKITO_JAR_PATH) $(JAVA_CGLIB_JAR_PATH) $(JAVA_ASSERTJ_JAR_PATH) java_test: java resolve_test_deps $(AM_V_GEN)mkdir -p $(TEST_CLASSES) diff --git a/java/pom.xml.template b/java/pom.xml.template index 4cd70458a87..4abff4768e4 100644 --- a/java/pom.xml.template +++ b/java/pom.xml.template @@ -147,13 +147,25 @@ junit junit - 4.12 + 4.13.1 + test + + + org.hamcrest + hamcrest + 2.2 + test + + + cglib + cglib + 3.3.0 test org.assertj assertj-core - 1.7.1 + 2.9.0 test diff --git a/java/rocksjni/cache.cc b/java/rocksjni/cache.cc new file mode 100644 index 00000000000..33c0a2f0be7 --- /dev/null +++ b/java/rocksjni/cache.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// ROCKSDB_NAMESPACE::Cache. 
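The new rocksjni/cache.cc below simply forwards two Java methods to the native Cache accessors; for reference, the underlying C++ calls look like this (a sketch only; the cache size and function name are arbitrary assumptions):

#include "rocksdb/cache.h"

void ReportCacheUsage() {
  auto cache = ROCKSDB_NAMESPACE::NewLRUCache(64ULL << 20);
  size_t used = cache->GetUsage();          // total charge of all entries
  size_t pinned = cache->GetPinnedUsage();  // charge of entries still referenced
  (void)used;
  (void)pinned;
}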
+ +#include "rocksdb/cache.h" + +#include + +#include "include/org_rocksdb_Cache.h" + +/* + * Class: org_rocksdb_Cache + * Method: getUsage + * Signature: (J)J + */ +jlong Java_org_rocksdb_Cache_getUsage(JNIEnv*, jclass, jlong jhandle) { + auto* sptr_cache = + reinterpret_cast*>(jhandle); + return static_cast(sptr_cache->get()->GetUsage()); +} + +/* + * Class: org_rocksdb_Cache + * Method: getPinnedUsage + * Signature: (J)J + */ +jlong Java_org_rocksdb_Cache_getPinnedUsage(JNIEnv*, jclass, jlong jhandle) { + auto* sptr_cache = + reinterpret_cast*>(jhandle); + return static_cast(sptr_cache->get()->GetPinnedUsage()); +} diff --git a/java/rocksjni/compression_options.cc b/java/rocksjni/compression_options.cc index 4fed5ba5f9c..1857faf6807 100644 --- a/java/rocksjni/compression_options.cc +++ b/java/rocksjni/compression_options.cc @@ -132,6 +132,27 @@ jint Java_org_rocksdb_CompressionOptions_zstdMaxTrainBytes( return static_cast(opt->zstd_max_train_bytes); } +/* + * Class: org_rocksdb_CompressionOptions + * Method: setMaxDictBufferBytes + * Signature: (JJ)V + */ +void Java_org_rocksdb_CompressionOptions_setMaxDictBufferBytes( + JNIEnv*, jobject, jlong jhandle, jlong jmax_dict_buffer_bytes) { + auto* opt = reinterpret_cast(jhandle); + opt->max_dict_buffer_bytes = static_cast(jmax_dict_buffer_bytes); +} + +/* + * Class: org_rocksdb_CompressionOptions + * Method: maxDictBufferBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_CompressionOptions_maxDictBufferBytes(JNIEnv*, jobject, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->max_dict_buffer_bytes); +} /* * Class: org_rocksdb_CompressionOptions * Method: setEnabled diff --git a/java/rocksjni/concurrent_task_limiter.cc b/java/rocksjni/concurrent_task_limiter.cc index ddcdda478e3..591f721a2f6 100644 --- a/java/rocksjni/concurrent_task_limiter.cc +++ b/java/rocksjni/concurrent_task_limiter.cc @@ -15,7 +15,7 @@ */ jlong Java_org_rocksdb_ConcurrentTaskLimiterImpl_newConcurrentTaskLimiterImpl0( JNIEnv* env, jclass, jstring jname, jint limit) { - jboolean has_exception; + jboolean has_exception = JNI_FALSE; std::string name = ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jname, &has_exception); if (JNI_TRUE == has_exception) { diff --git a/java/rocksjni/event_listener.cc b/java/rocksjni/event_listener.cc new file mode 100644 index 00000000000..5b07ffc9be0 --- /dev/null +++ b/java/rocksjni/event_listener.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::EventListener. 
+ +#include + +#include + +#include "include/org_rocksdb_AbstractEventListener.h" +#include "rocksjni/event_listener_jnicallback.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_AbstractEventListener + * Method: createNewEventListener + * Signature: (J)J + */ +jlong Java_org_rocksdb_AbstractEventListener_createNewEventListener( + JNIEnv* env, jobject jobj, jlong jenabled_event_callback_values) { + auto enabled_event_callbacks = + ROCKSDB_NAMESPACE::EnabledEventCallbackJni::toCppEnabledEventCallbacks( + jenabled_event_callback_values); + auto* sptr_event_listener = + new std::shared_ptr( + new ROCKSDB_NAMESPACE::EventListenerJniCallback( + env, jobj, enabled_event_callbacks)); + return reinterpret_cast(sptr_event_listener); +} + +/* + * Class: org_rocksdb_AbstractEventListener + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_AbstractEventListener_disposeInternal(JNIEnv*, jobject, + jlong jhandle) { + delete reinterpret_cast*>( + jhandle); +} diff --git a/java/rocksjni/event_listener_jnicallback.cc b/java/rocksjni/event_listener_jnicallback.cc new file mode 100644 index 00000000000..6e4d8975edf --- /dev/null +++ b/java/rocksjni/event_listener_jnicallback.cc @@ -0,0 +1,502 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::EventListener. + +#include "rocksjni/event_listener_jnicallback.h" + +#include "rocksjni/portal.h" + +namespace rocksdb { +EventListenerJniCallback::EventListenerJniCallback( + JNIEnv* env, jobject jevent_listener, + const std::set& enabled_event_callbacks) + : JniCallback(env, jevent_listener), + m_enabled_event_callbacks(enabled_event_callbacks) { + InitCallbackMethodId( + m_on_flush_completed_proxy_mid, EnabledEventCallback::ON_FLUSH_COMPLETED, + env, AbstractEventListenerJni::getOnFlushCompletedProxyMethodId); + + InitCallbackMethodId(m_on_flush_begin_proxy_mid, + EnabledEventCallback::ON_FLUSH_BEGIN, env, + AbstractEventListenerJni::getOnFlushBeginProxyMethodId); + + InitCallbackMethodId(m_on_table_file_deleted_mid, + EnabledEventCallback::ON_TABLE_FILE_DELETED, env, + AbstractEventListenerJni::getOnTableFileDeletedMethodId); + + InitCallbackMethodId( + m_on_compaction_begin_proxy_mid, + EnabledEventCallback::ON_COMPACTION_BEGIN, env, + AbstractEventListenerJni::getOnCompactionBeginProxyMethodId); + + InitCallbackMethodId( + m_on_compaction_completed_proxy_mid, + EnabledEventCallback::ON_COMPACTION_COMPLETED, env, + AbstractEventListenerJni::getOnCompactionCompletedProxyMethodId); + + InitCallbackMethodId(m_on_table_file_created_mid, + EnabledEventCallback::ON_TABLE_FILE_CREATED, env, + AbstractEventListenerJni::getOnTableFileCreatedMethodId); + + InitCallbackMethodId( + m_on_table_file_creation_started_mid, + EnabledEventCallback::ON_TABLE_FILE_CREATION_STARTED, env, + AbstractEventListenerJni::getOnTableFileCreationStartedMethodId); + + InitCallbackMethodId(m_on_mem_table_sealed_mid, + EnabledEventCallback::ON_MEMTABLE_SEALED, env, + AbstractEventListenerJni::getOnMemTableSealedMethodId); + + InitCallbackMethodId( + m_on_column_family_handle_deletion_started_mid, + EnabledEventCallback::ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED, env, + AbstractEventListenerJni::getOnColumnFamilyHandleDeletionStartedMethodId); + + 
InitCallbackMethodId( + m_on_external_file_ingested_proxy_mid, + EnabledEventCallback::ON_EXTERNAL_FILE_INGESTED, env, + AbstractEventListenerJni::getOnExternalFileIngestedProxyMethodId); + + InitCallbackMethodId( + m_on_background_error_proxy_mid, + EnabledEventCallback::ON_BACKGROUND_ERROR, env, + AbstractEventListenerJni::getOnBackgroundErrorProxyMethodId); + + InitCallbackMethodId( + m_on_stall_conditions_changed_mid, + EnabledEventCallback::ON_STALL_CONDITIONS_CHANGED, env, + AbstractEventListenerJni::getOnStallConditionsChangedMethodId); + + InitCallbackMethodId(m_on_file_read_finish_mid, + EnabledEventCallback::ON_FILE_READ_FINISH, env, + AbstractEventListenerJni::getOnFileReadFinishMethodId); + + InitCallbackMethodId(m_on_file_write_finish_mid, + EnabledEventCallback::ON_FILE_WRITE_FINISH, env, + AbstractEventListenerJni::getOnFileWriteFinishMethodId); + + InitCallbackMethodId(m_on_file_flush_finish_mid, + EnabledEventCallback::ON_FILE_FLUSH_FINISH, env, + AbstractEventListenerJni::getOnFileFlushFinishMethodId); + + InitCallbackMethodId(m_on_file_sync_finish_mid, + EnabledEventCallback::ON_FILE_SYNC_FINISH, env, + AbstractEventListenerJni::getOnFileSyncFinishMethodId); + + InitCallbackMethodId( + m_on_file_range_sync_finish_mid, + EnabledEventCallback::ON_FILE_RANGE_SYNC_FINISH, env, + AbstractEventListenerJni::getOnFileRangeSyncFinishMethodId); + + InitCallbackMethodId( + m_on_file_truncate_finish_mid, + EnabledEventCallback::ON_FILE_TRUNCATE_FINISH, env, + AbstractEventListenerJni::getOnFileTruncateFinishMethodId); + + InitCallbackMethodId(m_on_file_close_finish_mid, + EnabledEventCallback::ON_FILE_CLOSE_FINISH, env, + AbstractEventListenerJni::getOnFileCloseFinishMethodId); + + InitCallbackMethodId( + m_should_be_notified_on_file_io, + EnabledEventCallback::SHOULD_BE_NOTIFIED_ON_FILE_IO, env, + AbstractEventListenerJni::getShouldBeNotifiedOnFileIOMethodId); + + InitCallbackMethodId( + m_on_error_recovery_begin_proxy_mid, + EnabledEventCallback::ON_ERROR_RECOVERY_BEGIN, env, + AbstractEventListenerJni::getOnErrorRecoveryBeginProxyMethodId); + + InitCallbackMethodId( + m_on_error_recovery_completed_mid, + EnabledEventCallback::ON_ERROR_RECOVERY_COMPLETED, env, + AbstractEventListenerJni::getOnErrorRecoveryCompletedMethodId); +} + +EventListenerJniCallback::~EventListenerJniCallback() {} + +void EventListenerJniCallback::OnFlushCompleted( + DB* db, const FlushJobInfo& flush_job_info) { + if (m_on_flush_completed_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jflush_job_info = SetupCallbackInvocation( + env, attached_thread, flush_job_info, + FlushJobInfoJni::fromCppFlushJobInfo); + + if (jflush_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_flush_completed_proxy_mid, + reinterpret_cast(db), jflush_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jflush_job_info}); +} + +void EventListenerJniCallback::OnFlushBegin( + DB* db, const FlushJobInfo& flush_job_info) { + if (m_on_flush_begin_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jflush_job_info = SetupCallbackInvocation( + env, attached_thread, flush_job_info, + FlushJobInfoJni::fromCppFlushJobInfo); + + if (jflush_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_flush_begin_proxy_mid, + reinterpret_cast(db), jflush_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jflush_job_info}); +} + +void EventListenerJniCallback::OnTableFileDeleted( + const 
TableFileDeletionInfo& info) { + if (m_on_table_file_deleted_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jdeletion_info = SetupCallbackInvocation( + env, attached_thread, info, + TableFileDeletionInfoJni::fromCppTableFileDeletionInfo); + + if (jdeletion_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_table_file_deleted_mid, + jdeletion_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jdeletion_info}); +} + +void EventListenerJniCallback::OnCompactionBegin(DB* db, + const CompactionJobInfo& ci) { + if (m_on_compaction_begin_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcompaction_job_info = SetupCallbackInvocation( + env, attached_thread, ci, CompactionJobInfoJni::fromCppCompactionJobInfo); + + if (jcompaction_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_compaction_begin_proxy_mid, + reinterpret_cast(db), jcompaction_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcompaction_job_info}); +} + +void EventListenerJniCallback::OnCompactionCompleted( + DB* db, const CompactionJobInfo& ci) { + if (m_on_compaction_completed_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcompaction_job_info = SetupCallbackInvocation( + env, attached_thread, ci, CompactionJobInfoJni::fromCppCompactionJobInfo); + + if (jcompaction_job_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_compaction_completed_proxy_mid, + reinterpret_cast(db), jcompaction_job_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcompaction_job_info}); +} + +void EventListenerJniCallback::OnTableFileCreated( + const TableFileCreationInfo& info) { + if (m_on_table_file_created_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jfile_creation_info = SetupCallbackInvocation( + env, attached_thread, info, + TableFileCreationInfoJni::fromCppTableFileCreationInfo); + + if (jfile_creation_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_table_file_created_mid, + jfile_creation_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jfile_creation_info}); +} + +void EventListenerJniCallback::OnTableFileCreationStarted( + const TableFileCreationBriefInfo& info) { + if (m_on_table_file_creation_started_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jcreation_brief_info = + SetupCallbackInvocation( + env, attached_thread, info, + TableFileCreationBriefInfoJni::fromCppTableFileCreationBriefInfo); + + if (jcreation_brief_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_table_file_creation_started_mid, + jcreation_brief_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcreation_brief_info}); +} + +void EventListenerJniCallback::OnMemTableSealed(const MemTableInfo& info) { + if (m_on_mem_table_sealed_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jmem_table_info = SetupCallbackInvocation( + env, attached_thread, info, MemTableInfoJni::fromCppMemTableInfo); + + if (jmem_table_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_mem_table_sealed_mid, + jmem_table_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jmem_table_info}); +} + +void EventListenerJniCallback::OnColumnFamilyHandleDeletionStarted( + ColumnFamilyHandle* handle) { + if (m_on_column_family_handle_deletion_started_mid == nullptr) { + return; + } + + JNIEnv* env; + 
jboolean attached_thread; + jobject jcf_handle = SetupCallbackInvocation( + env, attached_thread, *handle, + ColumnFamilyHandleJni::fromCppColumnFamilyHandle); + + if (jcf_handle != nullptr) { + env->CallVoidMethod(m_jcallback_obj, + m_on_column_family_handle_deletion_started_mid, + jcf_handle); + } + + CleanupCallbackInvocation(env, attached_thread, {&jcf_handle}); +} + +void EventListenerJniCallback::OnExternalFileIngested( + DB* db, const ExternalFileIngestionInfo& info) { + if (m_on_external_file_ingested_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jingestion_info = SetupCallbackInvocation( + env, attached_thread, info, + ExternalFileIngestionInfoJni::fromCppExternalFileIngestionInfo); + + if (jingestion_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_external_file_ingested_proxy_mid, + reinterpret_cast(db), jingestion_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jingestion_info}); +} + +void EventListenerJniCallback::OnBackgroundError(BackgroundErrorReason reason, + Status* bg_error) { + if (m_on_background_error_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jstatus = SetupCallbackInvocation( + env, attached_thread, *bg_error, StatusJni::construct); + + if (jstatus != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_background_error_proxy_mid, + static_cast(reason), jstatus); + } + + CleanupCallbackInvocation(env, attached_thread, {&jstatus}); +} + +void EventListenerJniCallback::OnStallConditionsChanged( + const WriteStallInfo& info) { + if (m_on_stall_conditions_changed_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jwrite_stall_info = SetupCallbackInvocation( + env, attached_thread, info, WriteStallInfoJni::fromCppWriteStallInfo); + + if (jwrite_stall_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_stall_conditions_changed_mid, + jwrite_stall_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jwrite_stall_info}); +} + +void EventListenerJniCallback::OnFileReadFinish(const FileOperationInfo& info) { + OnFileOperation(m_on_file_read_finish_mid, info); +} + +void EventListenerJniCallback::OnFileWriteFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_write_finish_mid, info); +} + +void EventListenerJniCallback::OnFileFlushFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_flush_finish_mid, info); +} + +void EventListenerJniCallback::OnFileSyncFinish(const FileOperationInfo& info) { + OnFileOperation(m_on_file_sync_finish_mid, info); +} + +void EventListenerJniCallback::OnFileRangeSyncFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_range_sync_finish_mid, info); +} + +void EventListenerJniCallback::OnFileTruncateFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_truncate_finish_mid, info); +} + +void EventListenerJniCallback::OnFileCloseFinish( + const FileOperationInfo& info) { + OnFileOperation(m_on_file_close_finish_mid, info); +} + +bool EventListenerJniCallback::ShouldBeNotifiedOnFileIO() { + if (m_should_be_notified_on_file_io == nullptr) { + return false; + } + + jboolean attached_thread = JNI_FALSE; + JNIEnv* env = getJniEnv(&attached_thread); + assert(env != nullptr); + + jboolean jshould_be_notified = + env->CallBooleanMethod(m_jcallback_obj, m_should_be_notified_on_file_io); + + CleanupCallbackInvocation(env, attached_thread, {}); + + return static_cast(jshould_be_notified); +} + +void 
EventListenerJniCallback::OnErrorRecoveryBegin( + BackgroundErrorReason reason, Status bg_error, bool* auto_recovery) { + if (m_on_error_recovery_begin_proxy_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jbg_error = SetupCallbackInvocation( + env, attached_thread, bg_error, StatusJni::construct); + + if (jbg_error != nullptr) { + jboolean jauto_recovery = env->CallBooleanMethod( + m_jcallback_obj, m_on_error_recovery_begin_proxy_mid, + static_cast(reason), jbg_error); + *auto_recovery = jauto_recovery == JNI_TRUE; + } + + CleanupCallbackInvocation(env, attached_thread, {&jbg_error}); +} + +void EventListenerJniCallback::OnErrorRecoveryCompleted(Status old_bg_error) { + if (m_on_error_recovery_completed_mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jold_bg_error = SetupCallbackInvocation( + env, attached_thread, old_bg_error, StatusJni::construct); + + if (jold_bg_error != nullptr) { + env->CallVoidMethod(m_jcallback_obj, m_on_error_recovery_completed_mid, + jold_bg_error); + } + + CleanupCallbackInvocation(env, attached_thread, {&jold_bg_error}); +} + +void EventListenerJniCallback::InitCallbackMethodId( + jmethodID& mid, EnabledEventCallback eec, JNIEnv* env, + jmethodID (*get_id)(JNIEnv* env)) { + if (m_enabled_event_callbacks.count(eec) == 1) { + mid = get_id(env); + } else { + mid = nullptr; + } +} + +template +jobject EventListenerJniCallback::SetupCallbackInvocation( + JNIEnv*& env, jboolean& attached_thread, const T& cpp_obj, + jobject (*convert)(JNIEnv* env, const T* cpp_obj)) { + attached_thread = JNI_FALSE; + env = getJniEnv(&attached_thread); + assert(env != nullptr); + + return convert(env, &cpp_obj); +} + +void EventListenerJniCallback::CleanupCallbackInvocation( + JNIEnv* env, jboolean attached_thread, + std::initializer_list refs) { + for (auto* ref : refs) { + if (*ref == nullptr) continue; + env->DeleteLocalRef(*ref); + } + + if (env->ExceptionCheck()) { + // exception thrown from CallVoidMethod + env->ExceptionDescribe(); // print out exception to stderr + } + + releaseJniEnv(attached_thread); +} + +void EventListenerJniCallback::OnFileOperation(const jmethodID& mid, + const FileOperationInfo& info) { + if (mid == nullptr) { + return; + } + + JNIEnv* env; + jboolean attached_thread; + jobject jop_info = SetupCallbackInvocation( + env, attached_thread, info, + FileOperationInfoJni::fromCppFileOperationInfo); + + if (jop_info != nullptr) { + env->CallVoidMethod(m_jcallback_obj, mid, jop_info); + } + + CleanupCallbackInvocation(env, attached_thread, {&jop_info}); +} +} // namespace rocksdb diff --git a/java/rocksjni/event_listener_jnicallback.h b/java/rocksjni/event_listener_jnicallback.h new file mode 100644 index 00000000000..e3b5d0e940b --- /dev/null +++ b/java/rocksjni/event_listener_jnicallback.h @@ -0,0 +1,122 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::EventListener. 
+ +#ifndef JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_ +#define JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_ + +#include + +#include +#include + +#include "rocksdb/listener.h" +#include "rocksjni/jnicallback.h" + +namespace rocksdb { + +enum EnabledEventCallback { + ON_FLUSH_COMPLETED = 0x0, + ON_FLUSH_BEGIN = 0x1, + ON_TABLE_FILE_DELETED = 0x2, + ON_COMPACTION_BEGIN = 0x3, + ON_COMPACTION_COMPLETED = 0x4, + ON_TABLE_FILE_CREATED = 0x5, + ON_TABLE_FILE_CREATION_STARTED = 0x6, + ON_MEMTABLE_SEALED = 0x7, + ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED = 0x8, + ON_EXTERNAL_FILE_INGESTED = 0x9, + ON_BACKGROUND_ERROR = 0xA, + ON_STALL_CONDITIONS_CHANGED = 0xB, + ON_FILE_READ_FINISH = 0xC, + ON_FILE_WRITE_FINISH = 0xD, + ON_FILE_FLUSH_FINISH = 0xE, + ON_FILE_SYNC_FINISH = 0xF, + ON_FILE_RANGE_SYNC_FINISH = 0x10, + ON_FILE_TRUNCATE_FINISH = 0x11, + ON_FILE_CLOSE_FINISH = 0x12, + SHOULD_BE_NOTIFIED_ON_FILE_IO = 0x13, + ON_ERROR_RECOVERY_BEGIN = 0x14, + ON_ERROR_RECOVERY_COMPLETED = 0x15, + + NUM_ENABLED_EVENT_CALLBACK = 0x16, +}; + +class EventListenerJniCallback : public JniCallback, public EventListener { + public: + EventListenerJniCallback( + JNIEnv* env, jobject jevent_listener, + const std::set& enabled_event_callbacks); + virtual ~EventListenerJniCallback(); + virtual void OnFlushCompleted(DB* db, const FlushJobInfo& flush_job_info); + virtual void OnFlushBegin(DB* db, const FlushJobInfo& flush_job_info); + virtual void OnTableFileDeleted(const TableFileDeletionInfo& info); + virtual void OnCompactionBegin(DB* db, const CompactionJobInfo& ci); + virtual void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci); + virtual void OnTableFileCreated(const TableFileCreationInfo& info); + virtual void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& info); + virtual void OnMemTableSealed(const MemTableInfo& info); + virtual void OnColumnFamilyHandleDeletionStarted(ColumnFamilyHandle* handle); + virtual void OnExternalFileIngested(DB* db, + const ExternalFileIngestionInfo& info); + virtual void OnBackgroundError(BackgroundErrorReason reason, + Status* bg_error); + virtual void OnStallConditionsChanged(const WriteStallInfo& info); + virtual void OnFileReadFinish(const FileOperationInfo& info); + virtual void OnFileWriteFinish(const FileOperationInfo& info); + virtual void OnFileFlushFinish(const FileOperationInfo& info); + virtual void OnFileSyncFinish(const FileOperationInfo& info); + virtual void OnFileRangeSyncFinish(const FileOperationInfo& info); + virtual void OnFileTruncateFinish(const FileOperationInfo& info); + virtual void OnFileCloseFinish(const FileOperationInfo& info); + virtual bool ShouldBeNotifiedOnFileIO(); + virtual void OnErrorRecoveryBegin(BackgroundErrorReason reason, + Status bg_error, bool* auto_recovery); + virtual void OnErrorRecoveryCompleted(Status old_bg_error); + + private: + inline void InitCallbackMethodId(jmethodID& mid, EnabledEventCallback eec, + JNIEnv* env, + jmethodID (*get_id)(JNIEnv* env)); + template + inline jobject SetupCallbackInvocation( + JNIEnv*& env, jboolean& attached_thread, const T& cpp_obj, + jobject (*convert)(JNIEnv* env, const T* cpp_obj)); + inline void CleanupCallbackInvocation(JNIEnv* env, jboolean attached_thread, + std::initializer_list refs); + inline void OnFileOperation(const jmethodID& mid, + const FileOperationInfo& info); + + const std::set m_enabled_event_callbacks; + jmethodID m_on_flush_completed_proxy_mid; + jmethodID m_on_flush_begin_proxy_mid; + jmethodID m_on_table_file_deleted_mid; + jmethodID 
m_on_compaction_begin_proxy_mid; + jmethodID m_on_compaction_completed_proxy_mid; + jmethodID m_on_table_file_created_mid; + jmethodID m_on_table_file_creation_started_mid; + jmethodID m_on_mem_table_sealed_mid; + jmethodID m_on_column_family_handle_deletion_started_mid; + jmethodID m_on_external_file_ingested_proxy_mid; + jmethodID m_on_background_error_proxy_mid; + jmethodID m_on_stall_conditions_changed_mid; + jmethodID m_on_file_read_finish_mid; + jmethodID m_on_file_write_finish_mid; + jmethodID m_on_file_flush_finish_mid; + jmethodID m_on_file_sync_finish_mid; + jmethodID m_on_file_range_sync_finish_mid; + jmethodID m_on_file_truncate_finish_mid; + jmethodID m_on_file_close_finish_mid; + jmethodID m_should_be_notified_on_file_io; + jmethodID m_on_error_recovery_begin_proxy_mid; + jmethodID m_on_error_recovery_completed_mid; +}; + +} // namespace rocksdb + +#endif // JAVA_ROCKSJNI_EVENT_LISTENER_JNICALLBACK_H_ diff --git a/java/rocksjni/jnicallback.h b/java/rocksjni/jnicallback.h index b11ef2ef2cc..5baa8973c14 100644 --- a/java/rocksjni/jnicallback.h +++ b/java/rocksjni/jnicallback.h @@ -19,6 +19,8 @@ class JniCallback { JniCallback(JNIEnv* env, jobject jcallback_obj); virtual ~JniCallback(); + const jobject& GetJavaObject() const { return m_jcallback_obj; } + protected: JavaVM* m_jvm; jobject m_jcallback_obj; diff --git a/java/rocksjni/memory_util.cc b/java/rocksjni/memory_util.cc index fac288c925c..07284d434fe 100644 --- a/java/rocksjni/memory_util.cc +++ b/java/rocksjni/memory_util.cc @@ -22,20 +22,14 @@ * Signature: ([J[J)Ljava/util/Map; */ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType( - JNIEnv *env, jclass /*jclazz*/, jlongArray jdb_handles, jlongArray jcache_handles) { - std::vector dbs; - jsize db_handle_count = env->GetArrayLength(jdb_handles); - if(db_handle_count > 0) { - jlong *ptr_jdb_handles = env->GetLongArrayElements(jdb_handles, nullptr); - if (ptr_jdb_handles == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } - for (jsize i = 0; i < db_handle_count; i++) { - dbs.push_back( - reinterpret_cast(ptr_jdb_handles[i])); - } - env->ReleaseLongArrayElements(jdb_handles, ptr_jdb_handles, JNI_ABORT); + JNIEnv *env, jclass, jlongArray jdb_handles, jlongArray jcache_handles) { + jboolean has_exception = JNI_FALSE; + std::vector dbs = + ROCKSDB_NAMESPACE::JniUtil::fromJPointers( + env, jdb_handles, &has_exception); + if (has_exception == JNI_TRUE) { + // exception thrown: OutOfMemoryError + return nullptr; } std::unordered_set cache_set; @@ -103,5 +97,4 @@ jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType( } return jusage_by_type; - } diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 62ff1a3e16d..fbf3241792e 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -553,7 +553,8 @@ jlong Java_org_rocksdb_Options_dbPathsLen( void Java_org_rocksdb_Options_dbPaths( JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { - jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); + jboolean is_copy; + jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy); if (ptr_jtarget_size == nullptr) { // exception thrown: OutOfMemoryError return; @@ -581,7 +582,8 @@ void Java_org_rocksdb_Options_dbPaths( ptr_jtarget_size[i] = static_cast(db_path.target_size); } - env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_COMMIT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, + is_copy == JNI_TRUE ? 
0 : JNI_ABORT); } /* @@ -943,6 +945,7 @@ rocksdb_convert_cf_paths_from_java_helper(JNIEnv* env, jobjectArray path_array, jlong* size_array_ptr = env->GetLongArrayElements(size_array, nullptr); if (nullptr == size_array_ptr) { // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; return {}; } std::vector cf_paths; @@ -975,7 +978,7 @@ void Java_org_rocksdb_Options_setCfPaths(JNIEnv* env, jclass, jlong jhandle, jobjectArray path_array, jlongArray size_array) { auto* options = reinterpret_cast(jhandle); - jboolean has_exception; + jboolean has_exception = JNI_FALSE; std::vector cf_paths = rocksdb_convert_cf_paths_from_java_helper(env, path_array, size_array, &has_exception); @@ -1767,6 +1770,76 @@ jboolean Java_org_rocksdb_Options_strictBytesPerSync( return static_cast(opt->strict_bytes_per_sync); } +// Note: the RocksJava API currently only supports EventListeners implemented in +// Java. It could be extended in future to also support adding/removing +// EventListeners implemented in C++. +static void rocksdb_set_event_listeners_helper( + JNIEnv* env, jlongArray jlistener_array, + std::vector>& + listener_sptr_vec) { + jlong* ptr_jlistener_array = + env->GetLongArrayElements(jlistener_array, nullptr); + if (ptr_jlistener_array == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + const jsize array_size = env->GetArrayLength(jlistener_array); + listener_sptr_vec.clear(); + for (jsize i = 0; i < array_size; ++i) { + const auto& listener_sptr = + *reinterpret_cast*>( + ptr_jlistener_array[i]); + listener_sptr_vec.push_back(listener_sptr); + } +} + +/* + * Class: org_rocksdb_Options + * Method: setEventListeners + * Signature: (J[J)V + */ +void Java_org_rocksdb_Options_setEventListeners(JNIEnv* env, jclass, + jlong jhandle, + jlongArray jlistener_array) { + auto* opt = reinterpret_cast(jhandle); + rocksdb_set_event_listeners_helper(env, jlistener_array, opt->listeners); +} + +// Note: the RocksJava API currently only supports EventListeners implemented in +// Java. It could be extended in future to also support adding/removing +// EventListeners implemented in C++. 
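These setter/getter helpers exchange listeners with Java as raw jlong handles: each handle is the address of a heap-allocated std::shared_ptr<EventListener> owned by the Java wrapper, which is why the setter dereferences and copies the shared_ptr into the options rather than taking ownership, and why the getter can hand the original Java objects back via GetJavaObject(). A minimal sketch of that convention (hypothetical helper names, not part of the patch):

```cpp
#include <jni.h>

#include <memory>
#include <vector>

#include "rocksdb/listener.h"

// Hypothetical helper names; only the handle convention is taken from the patch.
jlong NewListenerHandle(
    const std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>& listener) {
  // The Java wrapper stores this address as its native handle.
  return reinterpret_cast<jlong>(
      new std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>(listener));
}

void CopyListenerHandles(
    const std::vector<jlong>& handles,
    std::vector<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>>* listeners) {
  listeners->clear();
  for (const jlong h : handles) {
    // Dereference the heap-allocated shared_ptr and copy it; the handle itself
    // stays owned by the Java wrapper until that wrapper is disposed.
    listeners->push_back(
        *reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::EventListener>*>(
            h));
  }
}
```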
+static jobjectArray rocksdb_get_event_listeners_helper( + JNIEnv* env, + const std::vector>& + listener_sptr_vec) { + jsize sz = static_cast(listener_sptr_vec.size()); + jclass jlistener_clazz = + ROCKSDB_NAMESPACE::AbstractEventListenerJni::getJClass(env); + jobjectArray jlisteners = env->NewObjectArray(sz, jlistener_clazz, nullptr); + if (jlisteners == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + for (jsize i = 0; i < sz; ++i) { + const auto* jni_cb = + static_cast( + listener_sptr_vec[i].get()); + env->SetObjectArrayElement(jlisteners, i, jni_cb->GetJavaObject()); + } + return jlisteners; +} + +/* + * Class: org_rocksdb_Options + * Method: eventListeners + * Signature: (J)[Lorg/rocksdb/AbstractEventListener; + */ +jobjectArray Java_org_rocksdb_Options_eventListeners(JNIEnv* env, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return rocksdb_get_event_listeners_helper(env, opt->listeners); +} + /* * Class: org_rocksdb_Options * Method: setEnableThreadTracking @@ -4070,7 +4143,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCfPaths(JNIEnv* env, jclass, jlongArray size_array) { auto* options = reinterpret_cast(jhandle); - jboolean has_exception; + jboolean has_exception = JNI_FALSE; std::vector cf_paths = rocksdb_convert_cf_paths_from_java_helper(env, path_array, size_array, &has_exception); @@ -4959,8 +5032,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplierAdditiona JNIEnv* env, jobject, jlong jhandle, jintArray jmax_bytes_for_level_multiplier_additional) { jsize len = env->GetArrayLength(jmax_bytes_for_level_multiplier_additional); - jint* additionals = - env->GetIntArrayElements(jmax_bytes_for_level_multiplier_additional, 0); + jint* additionals = env->GetIntArrayElements( + jmax_bytes_for_level_multiplier_additional, nullptr); if (additionals == nullptr) { // exception thrown: OutOfMemoryError return; @@ -5612,7 +5685,8 @@ jlong Java_org_rocksdb_DBOptions_dbPathsLen( void Java_org_rocksdb_DBOptions_dbPaths( JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { - jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); + jboolean is_copy; + jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, &is_copy); if (ptr_jtarget_size == nullptr) { // exception thrown: OutOfMemoryError return; @@ -5640,7 +5714,8 @@ void Java_org_rocksdb_DBOptions_dbPaths( ptr_jtarget_size[i] = static_cast(db_path.target_size); } - env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_COMMIT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, + is_copy == JNI_TRUE ? 
0 : JNI_ABORT); } /* @@ -6549,6 +6624,29 @@ jboolean Java_org_rocksdb_DBOptions_strictBytesPerSync( ->strict_bytes_per_sync); } +/* + * Class: org_rocksdb_DBOptions + * Method: setEventListeners + * Signature: (J[J)V + */ +void Java_org_rocksdb_DBOptions_setEventListeners(JNIEnv* env, jclass, + jlong jhandle, + jlongArray jlistener_array) { + auto* opt = reinterpret_cast(jhandle); + rocksdb_set_event_listeners_helper(env, jlistener_array, opt->listeners); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: eventListeners + * Signature: (J)[Lorg/rocksdb/AbstractEventListener; + */ +jobjectArray Java_org_rocksdb_DBOptions_eventListeners(JNIEnv* env, jclass, + jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return rocksdb_get_event_listeners_helper(env, opt->listeners); +} + /* * Class: org_rocksdb_DBOptions * Method: setDelayedWriteRate diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index e31da0f4fd3..3800c8324e4 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -10,14 +10,16 @@ #ifndef JAVA_ROCKSJNI_PORTAL_H_ #define JAVA_ROCKSJNI_PORTAL_H_ +#include + #include #include #include #include #include -#include #include #include +#include #include #include #include @@ -34,6 +36,7 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksjni/compaction_filter_factory_jnicallback.h" #include "rocksjni/comparatorjnicallback.h" +#include "rocksjni/event_listener_jnicallback.h" #include "rocksjni/loggerjnicallback.h" #include "rocksjni/table_filter_jnicallback.h" #include "rocksjni/trace_writer_jnicallback.h" @@ -223,7 +226,7 @@ class CodeJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getValueMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -261,7 +264,7 @@ class SubCodeJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getValueMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -326,7 +329,7 @@ class StatusJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCodeMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -347,7 +350,7 @@ class StatusJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getSubCodeMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -368,7 +371,7 @@ class StatusJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getStateMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -438,6 +441,10 @@ class StatusJni return jstatus; } + static jobject construct(JNIEnv* env, const Status* status) { + return construct(env, *status); + } + // Returns the equivalent org.rocksdb.Status.Code for the provided // C++ ROCKSDB_NAMESPACE::Status::Code enum static jbyte toJavaStatusCode(const ROCKSDB_NAMESPACE::Status::Code& code) { @@ -934,7 +941,7 @@ class RocksDBExceptionJni : * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be 
retrieved */ static jmethodID getStatusMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -1025,7 +1032,7 @@ class ListJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getIteratorMethod(JNIEnv* env) { jclass jlist_clazz = getListClass(env); @@ -1046,7 +1053,7 @@ class ListJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getHasNextMethod(JNIEnv* env) { jclass jiterator_clazz = getIteratorClass(env); @@ -1066,7 +1073,7 @@ class ListJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNextMethod(JNIEnv* env) { jclass jiterator_clazz = getIteratorClass(env); @@ -1087,7 +1094,7 @@ class ListJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getArrayListConstructorMethodId(JNIEnv* env) { jclass jarray_list_clazz = getArrayListClass(env); @@ -1107,7 +1114,7 @@ class ListJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getListAddMethodId(JNIEnv* env) { jclass jlist_clazz = getListClass(env); @@ -1244,10 +1251,11 @@ class ByteBufferJni : public JavaClass { * Get the Java Method: ByteBuffer#allocate * * @param env A pointer to the Java environment - * @param jbytebuffer_clazz if you have a reference to a ByteBuffer class, or nullptr + * @param jbytebuffer_clazz if you have a reference to a ByteBuffer class, or + * nullptr * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getAllocateMethodId(JNIEnv* env, jclass jbytebuffer_clazz = nullptr) { @@ -1270,7 +1278,7 @@ class ByteBufferJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getArrayMethodId(JNIEnv* env, jclass jbytebuffer_clazz = nullptr) { @@ -1479,7 +1487,7 @@ class StringBuilderJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getListAddMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2366,7 +2374,7 @@ class MapJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMapPutMethodId(JNIEnv* env) { jclass jlist_clazz = getJClass(env); @@ -2898,7 +2906,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getPutCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2918,7 +2926,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or 
nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getPutMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2938,7 +2946,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMergeCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2958,7 +2966,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMergeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2978,7 +2986,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -2998,7 +3006,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3018,7 +3026,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getSingleDeleteCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3038,7 +3046,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getSingleDeleteMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3058,7 +3066,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteRangeCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3078,7 +3086,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getDeleteRangeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3098,7 +3106,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getLogDataMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3118,7 +3126,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getPutBlobIndexCfMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3138,7 +3146,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkBeginPrepareMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3158,7 +3166,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or 
nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkEndPrepareMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3178,7 +3186,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkNoopMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3198,7 +3206,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkRollbackMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3218,7 +3226,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getMarkCommitMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3238,7 +3246,7 @@ class WriteBatchHandlerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getContinueMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3274,7 +3282,7 @@ class WriteBatchSavePointJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getConstructorMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3365,7 +3373,7 @@ class HistogramDataJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getConstructorMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3461,6 +3469,19 @@ class ColumnFamilyHandleJni : public RocksDBNativeClass { public: + static jobject fromCppColumnFamilyHandle( + JNIEnv* env, const ROCKSDB_NAMESPACE::ColumnFamilyHandle* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + return env->NewObject(jclazz, ctor, reinterpret_cast(info)); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(J)V"); + } + /** * Get the Java Class org.rocksdb.ColumnFamilyHandle * @@ -3541,7 +3562,7 @@ class AbstractCompactionFilterFactoryJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3562,7 +3583,7 @@ class AbstractCompactionFilterFactoryJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCreateCompactionFilterMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3629,7 +3650,7 @@ class AbstractComparatorJniBridge : public JavaClass { * @param jclazz the AbstractComparatorJniBridge class * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCompareInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID 
mid = @@ -3646,7 +3667,7 @@ class AbstractComparatorJniBridge : public JavaClass { * @param jclazz the AbstractComparatorJniBridge class * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getFindShortestSeparatorInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID mid = @@ -3663,7 +3684,7 @@ class AbstractComparatorJniBridge : public JavaClass { * @param jclazz the AbstractComparatorJniBridge class * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getFindShortSuccessorInternalMethodId(JNIEnv* env, jclass jclazz) { static jmethodID mid = @@ -3699,7 +3720,7 @@ class AbstractComparatorJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNameMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -3996,7 +4017,7 @@ class WBWIRocksIteratorJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Field ID or nullptr if the class or field id could not - * be retieved + * be retrieved */ static jfieldID getWriteEntryField(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -4317,7 +4338,7 @@ class LoggerJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getLogMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -4961,7 +4982,24 @@ class TickerTypeJni { return -0x14; case ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL: return -0x15; - + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT: + return -0x16; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT: + return -0x17; + case ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT: + return -0x18; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT: + return -0x19; + case ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT: + return -0x1A; + case ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT: + return -0x1B; + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH: + return -0x1C; + case ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH: + return -0x1D; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: // 0x5F for backwards compatibility on current minor version. return 0x5F; @@ -5273,6 +5311,25 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_PERIODIC; case -0x15: return ROCKSDB_NAMESPACE::Tickers::COMPACT_WRITE_BYTES_TTL; + case -0x16: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_ERROR_COUNT; + case -0x17: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_BG_IO_ERROR_COUNT; + case -0x18: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT; + case -0x19: + return ROCKSDB_NAMESPACE::Tickers::ERROR_HANDLER_AUTORESUME_COUNT; + case -0x1A: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT; + case -0x1B: + return ROCKSDB_NAMESPACE::Tickers:: + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT; + case -0x1C: + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_PAYLOAD_BYTES_AT_FLUSH; + case -0x1D: + return ROCKSDB_NAMESPACE::Tickers::MEMTABLE_GARBAGE_BYTES_AT_FLUSH; case 0x5F: // 0x5F for backwards compatibility on current minor version. 
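The new ticker codes added to this mapping are negative because TickerType crosses the JNI boundary as a signed 8-bit jbyte: once the non-negative code space is used up, additional tickers are assigned negative values (-0x16 through -0x1D here), and the forward and reverse switches must be kept in sync by hand. A self-contained miniature of the pattern with a round-trip check (illustrative only, not the portal code):

```cpp
#include <cassert>
#include <cstdint>

enum class MiniTicker { BG_ERROR_COUNT, BG_IO_ERROR_COUNT };

// Forward mapping to the signed byte code sent to Java.
int8_t ToJavaByte(MiniTicker t) {
  switch (t) {
    case MiniTicker::BG_ERROR_COUNT:
      return -0x16;
    case MiniTicker::BG_IO_ERROR_COUNT:
      return -0x17;
  }
  return 0;  // unreachable
}

// Reverse mapping from the byte code back to the C++ enum.
MiniTicker ToCppTicker(int8_t b) {
  switch (b) {
    case -0x16:
      return MiniTicker::BG_ERROR_COUNT;
    case -0x17:
      return MiniTicker::BG_IO_ERROR_COUNT;
    default:
      return MiniTicker::BG_ERROR_COUNT;
  }
}

int main() {
  // If only one of the two switches were updated, this round trip would fail.
  assert(ToCppTicker(ToJavaByte(MiniTicker::BG_IO_ERROR_COUNT)) ==
         MiniTicker::BG_IO_ERROR_COUNT);
  return 0;
}
```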
return ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX; @@ -5392,6 +5449,8 @@ class HistogramTypeJni { return 0x30; case ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL: return 0x31; + case ROCKSDB_NAMESPACE::Histograms::ERROR_HANDLER_AUTORESUME_RETRY_COUNT: + return 0x31; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: // 0x1F for backwards compatibility on current minor version. return 0x1F; @@ -5506,6 +5565,9 @@ class HistogramTypeJni { return ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL; case 0x31: return ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL; + case 0x32: + return ROCKSDB_NAMESPACE::Histograms:: + ERROR_HANDLER_AUTORESUME_RETRY_COUNT; case 0x1F: // 0x1F for backwards compatibility on current minor version. return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; @@ -5698,7 +5760,8 @@ class TransactionJni : public JavaClass { return nullptr; } - jlong *body = env->GetLongArrayElements(jtransaction_ids, nullptr); + jboolean is_copy; + jlong* body = env->GetLongArrayElements(jtransaction_ids, &is_copy); if(body == nullptr) { // exception thrown: OutOfMemoryError env->DeleteLocalRef(jkey); @@ -5708,7 +5771,8 @@ class TransactionJni : public JavaClass { for(size_t i = 0; i < len; ++i) { body[i] = static_cast(transaction_ids[i]); } - env->ReleaseLongArrayElements(jtransaction_ids, body, 0); + env->ReleaseLongArrayElements(jtransaction_ids, body, + is_copy == JNI_TRUE ? 0 : JNI_ABORT); jobject jwaiting_transactions = env->CallObjectMethod(jtransaction, mid, static_cast(column_family_id), jkey, jtransaction_ids); @@ -5979,7 +6043,7 @@ class AbstractTableFilterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getFilterMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -6019,7 +6083,11 @@ class TablePropertiesJni : public JavaClass { return nullptr; } - jmethodID mid = env->GetMethodID(jclazz, "", "(JJJJJJJJJJJJJJJJJJJ[BLjava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/util/Map;Ljava/util/Map;Ljava/util/Map;)V"); + jmethodID mid = env->GetMethodID( + jclazz, "", + "(JJJJJJJJJJJJJJJJJJJJJ[BLjava/lang/String;Ljava/lang/String;Ljava/" + "lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/" + "String;Ljava/util/Map;Ljava/util/Map;Ljava/util/Map;)V"); if (mid == nullptr) { // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; @@ -6145,8 +6213,8 @@ class TablePropertiesJni : public JavaClass { return nullptr; } - jobject jtable_properties = env->NewObject(jclazz, mid, - static_cast(table_properties.data_size), + jobject jtable_properties = env->NewObject( + jclazz, mid, static_cast(table_properties.data_size), static_cast(table_properties.index_size), static_cast(table_properties.index_partitions), static_cast(table_properties.top_level_index_size), @@ -6165,17 +6233,14 @@ class TablePropertiesJni : public JavaClass { static_cast(table_properties.column_family_id), static_cast(table_properties.creation_time), static_cast(table_properties.oldest_key_time), - jcolumn_family_name, - jfilter_policy_name, - jcomparator_name, - jmerge_operator_name, - jprefix_extractor_name, - jproperty_collectors_names, - jcompression_name, - juser_collected_properties, - jreadable_properties, - jproperties_offsets - ); + static_cast( + table_properties.slow_compression_estimated_data_size), + static_cast( + 
table_properties.fast_compression_estimated_data_size), + jcolumn_family_name, jfilter_policy_name, jcomparator_name, + jmerge_operator_name, jprefix_extractor_name, + jproperty_collectors_names, jcompression_name, + juser_collected_properties, jreadable_properties, jproperties_offsets); if (env->ExceptionCheck()) { return nullptr; @@ -6249,7 +6314,7 @@ class ColumnFamilyDescriptorJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getColumnFamilyNameMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -6269,7 +6334,7 @@ class ColumnFamilyDescriptorJni : public JavaClass { * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getColumnFamilyOptionsMethod(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -6763,7 +6828,8 @@ class ThreadStatusJni : public JavaClass { env->DeleteLocalRef(jcf_name); return nullptr; } - jlong *body = env->GetLongArrayElements(joperation_properties, nullptr); + jboolean is_copy; + jlong* body = env->GetLongArrayElements(joperation_properties, &is_copy); if (body == nullptr) { // exception thrown: OutOfMemoryError env->DeleteLocalRef(jdb_name); @@ -6774,7 +6840,8 @@ class ThreadStatusJni : public JavaClass { for (size_t i = 0; i < len; ++i) { body[i] = static_cast(thread_status->op_properties[i]); } - env->ReleaseLongArrayElements(joperation_properties, body, 0); + env->ReleaseLongArrayElements(joperation_properties, body, + is_copy == JNI_TRUE ? 0 : JNI_ABORT); jobject jcfd = env->NewObject(jclazz, mid, static_cast(thread_status->thread_id), @@ -7395,7 +7462,7 @@ class AbstractTraceWriterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getWriteProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7416,7 +7483,7 @@ class AbstractTraceWriterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getCloseWriterProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7437,7 +7504,7 @@ class AbstractTraceWriterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getGetFileSizeMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7478,7 +7545,7 @@ class AbstractWalFilterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getColumnFamilyLogNumberMapMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7500,7 +7567,7 @@ class AbstractWalFilterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getLogRecordFoundProxyMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); @@ -7521,7 +7588,7 @@ class AbstractWalFilterJni * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ static jmethodID getNameMethodId(JNIEnv* env) { jclass 
jclazz = getJClass(env); @@ -7659,5 +7726,760 @@ class SanityLevelJni { } } }; + +// The portal class for org.rocksdb.AbstractListener.EnabledEventCallback +class EnabledEventCallbackJni { + public: + // Returns the set of equivalent C++ + // rocksdb::EnabledEventCallbackJni::EnabledEventCallback enums for + // the provided Java jenabled_event_callback_values + static std::set toCppEnabledEventCallbacks( + jlong jenabled_event_callback_values) { + std::set enabled_event_callbacks; + for (size_t i = 0; i < EnabledEventCallback::NUM_ENABLED_EVENT_CALLBACK; + ++i) { + if (((1ULL << i) & jenabled_event_callback_values) > 0) { + enabled_event_callbacks.emplace(static_cast(i)); + } + } + return enabled_event_callbacks; + } +}; + +// The portal class for org.rocksdb.AbstractEventListener +class AbstractEventListenerJni + : public RocksDBNativeClass< + const ROCKSDB_NAMESPACE::EventListenerJniCallback*, + AbstractEventListenerJni> { + public: + /** + * Get the Java Class org.rocksdb.AbstractEventListener + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractEventListener"); + } + + /** + * Get the Java Method: AbstractEventListener#onFlushCompletedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFlushCompletedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onFlushCompletedProxy", + "(JLorg/rocksdb/FlushJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFlushBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFlushBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onFlushBeginProxy", + "(JLorg/rocksdb/FlushJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileDeleted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileDeletedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onTableFileDeleted", "(Lorg/rocksdb/TableFileDeletionInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onCompactionBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnCompactionBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onCompactionBeginProxy", + "(JLorg/rocksdb/CompactionJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onCompactionCompletedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnCompactionCompletedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + 
env->GetMethodID(jclazz, "onCompactionCompletedProxy", + "(JLorg/rocksdb/CompactionJobInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileCreated + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileCreatedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onTableFileCreated", "(Lorg/rocksdb/TableFileCreationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onTableFileCreationStarted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnTableFileCreationStartedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onTableFileCreationStarted", + "(Lorg/rocksdb/TableFileCreationBriefInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onMemTableSealed + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnMemTableSealedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onMemTableSealed", + "(Lorg/rocksdb/MemTableInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: + * AbstractEventListener#onColumnFamilyHandleDeletionStarted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnColumnFamilyHandleDeletionStartedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onColumnFamilyHandleDeletionStarted", + "(Lorg/rocksdb/ColumnFamilyHandle;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onExternalFileIngestedProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnExternalFileIngestedProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "onExternalFileIngestedProxy", + "(JLorg/rocksdb/ExternalFileIngestionInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onBackgroundError + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnBackgroundErrorProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onBackgroundErrorProxy", + "(BLorg/rocksdb/Status;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onStallConditionsChanged + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnStallConditionsChangedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onStallConditionsChanged", + "(Lorg/rocksdb/WriteStallInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileReadFinish + * + * @param env A pointer to the Java 
environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileReadFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileReadFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileWriteFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileWriteFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileWriteFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileFlushFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileFlushFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileFlushFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileSyncFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileSyncFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileSyncFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileRangeSyncFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileRangeSyncFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileRangeSyncFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileTruncateFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileTruncateFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileTruncateFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onFileCloseFinish + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnFileCloseFinishMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID( + jclazz, "onFileCloseFinish", "(Lorg/rocksdb/FileOperationInfo;)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#shouldBeNotifiedOnFileIO + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getShouldBeNotifiedOnFileIOMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = + env->GetMethodID(jclazz, "shouldBeNotifiedOnFileIO", "()Z"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: 
AbstractEventListener#onErrorRecoveryBeginProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnErrorRecoveryBeginProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onErrorRecoveryBeginProxy", + "(BLorg/rocksdb/Status;)Z"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractEventListener#onErrorRecoveryCompleted + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID + */ + static jmethodID getOnErrorRecoveryCompletedMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID mid = env->GetMethodID(jclazz, "onErrorRecoveryCompleted", + "(Lorg/rocksdb/Status;)V"); + assert(mid != nullptr); + return mid; + } +}; + +class FlushJobInfoJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.FlushJobInfo object. + * + * @param env A pointer to the Java environment + * @param flush_job_info A Cpp flush job info object + * + * @return A reference to a Java org.rocksdb.FlushJobInfo object, or + * nullptr if an an exception occurs + */ + static jobject fromCppFlushJobInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::FlushJobInfo* flush_job_info) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &flush_job_info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &flush_job_info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jfile_path); + return nullptr; + } + jobject jtable_properties = TablePropertiesJni::fromCppTableProperties( + env, flush_job_info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jfile_path); + return nullptr; + } + return env->NewObject( + jclazz, ctor, static_cast(flush_job_info->cf_id), jcf_name, + jfile_path, static_cast(flush_job_info->thread_id), + static_cast(flush_job_info->job_id), + static_cast(flush_job_info->triggered_writes_slowdown), + static_cast(flush_job_info->triggered_writes_stop), + static_cast(flush_job_info->smallest_seqno), + static_cast(flush_job_info->largest_seqno), jtable_properties, + static_cast(flush_job_info->flush_reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/FlushJobInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(JLjava/lang/String;Ljava/lang/String;JIZZJJLorg/" + "rocksdb/TableProperties;B)V"); + } +}; + +class TableFileDeletionInfoJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.TableFileDeletionInfo object. 
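The method-ID getters above all use the same caching idiom: a function-local static resolved on first use. A sketch of the idiom (illustrative, not the patch code); it is safe because a jmethodID stays valid as long as its defining class remains loaded, and org.rocksdb.AbstractEventListener is loaded for the life of the application:

```cpp
#include <cassert>
#include <jni.h>

// Illustrative only: same idiom as the getters in AbstractEventListenerJni.
jmethodID GetOnFlushBeginProxyMethodIdSketch(JNIEnv* env, jclass jclazz) {
  // Resolved once on the first call and reused afterwards.
  static jmethodID mid = env->GetMethodID(jclazz, "onFlushBeginProxy",
                                          "(JLorg/rocksdb/FlushJobInfo;)V");
  assert(mid != nullptr);
  return mid;
}
```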
+ * + * @param env A pointer to the Java environment + * @param file_del_info A Cpp table file deletion info object + * + * @return A reference to a Java org.rocksdb.TableFileDeletionInfo object, or + * nullptr if an an exception occurs + */ + static jobject fromCppTableFileDeletionInfo( + JNIEnv* env, + const ROCKSDB_NAMESPACE::TableFileDeletionInfo* file_del_info) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &file_del_info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jobject jstatus = StatusJni::construct(env, file_del_info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + return env->NewObject(jclazz, ctor, jdb_name, + JniUtil::toJavaString(env, &file_del_info->file_path), + static_cast(file_del_info->job_id), jstatus); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileDeletionInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(Ljava/lang/String;Ljava/lang/String;ILorg/rocksdb/Status;)V"); + } +}; + +class CompactionJobInfoJni : public JavaClass { + public: + static jobject fromCppCompactionJobInfo( + JNIEnv* env, + const ROCKSDB_NAMESPACE::CompactionJobInfo* compaction_job_info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + return env->NewObject(jclazz, ctor, + reinterpret_cast(compaction_job_info)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/CompactionJobInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(J)V"); + } +}; + +class TableFileCreationInfoJni : public JavaClass { + public: + static jobject fromCppTableFileCreationInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::TableFileCreationInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jobject jtable_properties = + TablePropertiesJni::fromCppTableProperties(env, info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jobject jstatus = StatusJni::construct(env, info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jtable_properties); + return nullptr; + } + return env->NewObject(jclazz, ctor, static_cast(info->file_size), + jtable_properties, jstatus, jdb_name, jcf_name, + jfile_path, static_cast(info->job_id), + static_cast(info->reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileCreationInfo"); + } + 
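The getConstructorMethodId helpers in these portal classes resolve constructors under the special JNI name "&lt;init&gt;", and the descriptor string must match the Java constructor's parameter list exactly; otherwise GetMethodID returns nullptr and leaves NoSuchMethodError pending. A sketch of the lookup, using the TableFileCreationBriefInfo constructor signature that appears below (illustrative, not the patch code):

```cpp
#include <jni.h>

// Illustrative only: how a portal getConstructorMethodId helper resolves a ctor.
jmethodID GetBriefInfoConstructorSketch(JNIEnv* env) {
  jclass clazz = env->FindClass("org/rocksdb/TableFileCreationBriefInfo");
  if (clazz == nullptr) {
    return nullptr;  // exception pending: NoClassDefFoundError
  }
  // "<init>" is the JNI name for a constructor; the descriptor must match the
  // Java-side constructor parameter list exactly.
  return env->GetMethodID(
      clazz, "<init>",
      "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;IB)V");
}
```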
+ static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(JLorg/rocksdb/TableProperties;Lorg/rocksdb/Status;Ljava/lang/" + "String;Ljava/lang/String;Ljava/lang/String;IB)V"); + } +}; + +class TableFileCreationBriefInfoJni : public JavaClass { + public: + static jobject fromCppTableFileCreationBriefInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::TableFileCreationBriefInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jdb_name = JniUtil::toJavaString(env, &info->db_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + return nullptr; + } + jstring jfile_path = JniUtil::toJavaString(env, &info->file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + return env->NewObject(jclazz, ctor, jdb_name, jcf_name, jfile_path, + static_cast(info->job_id), + static_cast(info->reason)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFileCreationBriefInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID( + clazz, "", + "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;IB)V"); + } +}; + +class MemTableInfoJni : public JavaClass { + public: + static jobject fromCppMemTableInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::MemTableInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + return env->NewObject(jclazz, ctor, jcf_name, + static_cast(info->first_seqno), + static_cast(info->earliest_seqno), + static_cast(info->num_entries), + static_cast(info->num_deletes)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/MemTableInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(Ljava/lang/String;JJJJ)V"); + } +}; + +class ExternalFileIngestionInfoJni : public JavaClass { + public: + static jobject fromCppExternalFileIngestionInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::ExternalFileIngestionInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + jstring jexternal_file_path = + JniUtil::toJavaString(env, &info->external_file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jstring jinternal_file_path = + JniUtil::toJavaString(env, &info->internal_file_path); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jexternal_file_path); + return nullptr; + } + jobject jtable_properties = + TablePropertiesJni::fromCppTableProperties(env, info->table_properties); + if (jtable_properties == nullptr) { + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(jexternal_file_path); + env->DeleteLocalRef(jinternal_file_path); + return nullptr; + } + return env->NewObject( + jclazz, 
ctor, jcf_name, jexternal_file_path, jinternal_file_path, + static_cast(info->global_seqno), jtable_properties); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/ExternalFileIngestionInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(Ljava/lang/String;Ljava/lang/String;Ljava/lang/" + "String;JLorg/rocksdb/TableProperties;)V"); + } +}; + +class WriteStallInfoJni : public JavaClass { + public: + static jobject fromCppWriteStallInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::WriteStallInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jcf_name = JniUtil::toJavaString(env, &info->cf_name); + if (env->ExceptionCheck()) { + return nullptr; + } + return env->NewObject(jclazz, ctor, jcf_name, + static_cast(info->condition.cur), + static_cast(info->condition.prev)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/WriteStallInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(Ljava/lang/String;BB)V"); + } +}; + +class FileOperationInfoJni : public JavaClass { + public: + static jobject fromCppFileOperationInfo( + JNIEnv* env, const ROCKSDB_NAMESPACE::FileOperationInfo* info) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + jstring jpath = JniUtil::toJavaString(env, &info->path); + if (env->ExceptionCheck()) { + return nullptr; + } + jobject jstatus = StatusJni::construct(env, info->status); + if (jstatus == nullptr) { + env->DeleteLocalRef(jpath); + return nullptr; + } + return env->NewObject( + jclazz, ctor, jpath, static_cast(info->offset), + static_cast(info->length), + static_cast(info->start_ts.time_since_epoch().count()), + static_cast(info->duration.count()), jstatus); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/FileOperationInfo"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", + "(Ljava/lang/String;JJJJLorg/rocksdb/Status;)V"); + } +}; } // namespace ROCKSDB_NAMESPACE #endif // JAVA_ROCKSJNI_PORTAL_H_ diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 0f463e9b3d5..054c0fc89fd 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -421,8 +421,8 @@ jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__J_3J_3_3B( std::vector cf_descriptors; cf_descriptors.reserve(jlen); - jboolean jcf_options_handles_is_copy = JNI_FALSE; - jlong *jcf_options_handles_elems = env->GetLongArrayElements(jcf_options_handles, &jcf_options_handles_is_copy); + jlong* jcf_options_handles_elems = + env->GetLongArrayElements(jcf_options_handles, nullptr); if(jcf_options_handles_elems == nullptr) { // exception thrown: OutOfMemoryError return nullptr; @@ -2343,9 +2343,7 @@ jlongArray Java_org_rocksdb_RocksDB_getApproximateSizes( const jsize jlen = env->GetArrayLength(jrange_slice_handles); const size_t range_count = jlen / 2; - jboolean jranges_is_copy = JNI_FALSE; - jlong* jranges = env->GetLongArrayElements(jrange_slice_handles, - &jranges_is_copy); + jlong* jranges = env->GetLongArrayElements(jrange_slice_handles, nullptr); if (jranges == nullptr) { // exception thrown: OutOfMemoryError return 
nullptr; @@ -2430,14 +2428,13 @@ jlongArray Java_org_rocksdb_RocksDB_getApproximateMemTableStats( static_cast(count), static_cast(sizes)}; - const jsize jcount = static_cast(count); - jlongArray jsizes = env->NewLongArray(jcount); + jlongArray jsizes = env->NewLongArray(2); if (jsizes == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } - env->SetLongArrayRegion(jsizes, 0, jcount, results); + env->SetLongArrayRegion(jsizes, 0, 2, results); if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->DeleteLocalRef(jsizes); @@ -3256,9 +3253,8 @@ jobject Java_org_rocksdb_RocksDB_getPropertiesOfTablesInRange( reinterpret_cast(jcf_handle); } const jsize jlen = env->GetArrayLength(jrange_slice_handles); - jboolean jrange_slice_handles_is_copy = JNI_FALSE; - jlong *jrange_slice_handle = env->GetLongArrayElements( - jrange_slice_handles, &jrange_slice_handles_is_copy); + jlong* jrange_slice_handle = + env->GetLongArrayElements(jrange_slice_handles, nullptr); if (jrange_slice_handle == nullptr) { // exception occurred return nullptr; diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index d9e58992bd6..00ee8e7e22e 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -228,6 +228,17 @@ void Java_org_rocksdb_Slice_removePrefix0(JNIEnv* /*env*/, jobject /*jobj*/, slice->remove_prefix(length); } +/* + * Class: org_rocksdb_DirectSlice + * Method: setLength0 + * Signature: (JI)V + */ +void Java_org_rocksdb_DirectSlice_setLength0(JNIEnv* /*env*/, jobject /*jobj*/, + jlong handle, jint length) { + auto* slice = reinterpret_cast(handle); + slice->size_ = length; +} + /* * Class: org_rocksdb_Slice * Method: disposeInternalBuf diff --git a/java/rocksjni/testable_event_listener.cc b/java/rocksjni/testable_event_listener.cc new file mode 100644 index 00000000000..2540f2ecb69 --- /dev/null +++ b/java/rocksjni/testable_event_listener.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
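The new TestableEventListener JNI file below fills its synthetic structs with UINT64_MAX (and INT_MAX) sentinels. Because Java's long is signed, a UINT64_MAX field shows up on the Java side as -1 after the static_cast to jlong, yet the bit pattern round-trips losslessly, which is what makes it a convenient marker for the Java test to assert on. A minimal sketch of that conversion (assuming a two's-complement 64-bit jlong, as on all supported platforms):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t sentinel = UINT64_MAX;
  // jlong is a signed 64-bit type; this is the value the Java side observes.
  const int64_t as_jlong = static_cast<int64_t>(sentinel);
  assert(as_jlong == -1);
  // The bit pattern is preserved, so the value round-trips losslessly.
  assert(static_cast<uint64_t>(as_jlong) == UINT64_MAX);
  return 0;
}
```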
+#include +#include +#include + +#include "include/org_rocksdb_test_TestableEventListener.h" +#include "rocksdb/listener.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" + +using namespace ROCKSDB_NAMESPACE; + +static TableProperties newTablePropertiesForTest() { + TableProperties table_properties; + table_properties.data_size = UINT64_MAX; + table_properties.index_size = UINT64_MAX; + table_properties.index_partitions = UINT64_MAX; + table_properties.top_level_index_size = UINT64_MAX; + table_properties.index_key_is_user_key = UINT64_MAX; + table_properties.index_value_is_delta_encoded = UINT64_MAX; + table_properties.filter_size = UINT64_MAX; + table_properties.raw_key_size = UINT64_MAX; + table_properties.raw_value_size = UINT64_MAX; + table_properties.num_data_blocks = UINT64_MAX; + table_properties.num_entries = UINT64_MAX; + table_properties.num_deletions = UINT64_MAX; + table_properties.num_merge_operands = UINT64_MAX; + table_properties.num_range_deletions = UINT64_MAX; + table_properties.format_version = UINT64_MAX; + table_properties.fixed_key_len = UINT64_MAX; + table_properties.column_family_id = UINT64_MAX; + table_properties.creation_time = UINT64_MAX; + table_properties.oldest_key_time = UINT64_MAX; + table_properties.file_creation_time = UINT64_MAX; + table_properties.slow_compression_estimated_data_size = UINT64_MAX; + table_properties.fast_compression_estimated_data_size = UINT64_MAX; + table_properties.db_id = "dbId"; + table_properties.db_session_id = "sessionId"; + table_properties.column_family_name = "columnFamilyName"; + table_properties.filter_policy_name = "filterPolicyName"; + table_properties.comparator_name = "comparatorName"; + table_properties.merge_operator_name = "mergeOperatorName"; + table_properties.prefix_extractor_name = "prefixExtractorName"; + table_properties.property_collectors_names = "propertyCollectorsNames"; + table_properties.compression_name = "compressionName"; + table_properties.compression_options = "compressionOptions"; + table_properties.user_collected_properties = {{"key", "value"}}; + table_properties.readable_properties = {{"key", "value"}}; + table_properties.properties_offsets = {{"key", UINT64_MAX}}; + return table_properties; +} + +/* + * Class: org_rocksdb_test_TestableEventListener + * Method: invokeAllCallbacks + * Signature: (J)V + */ +void Java_org_rocksdb_test_TestableEventListener_invokeAllCallbacks( + JNIEnv *, jclass, jlong jhandle) { + const auto &el = + *reinterpret_cast *>( + jhandle); + + TableProperties table_properties = newTablePropertiesForTest(); + + FlushJobInfo flush_job_info; + flush_job_info.cf_id = INT_MAX; + flush_job_info.cf_name = "testColumnFamily"; + flush_job_info.file_path = "/file/path"; + flush_job_info.file_number = UINT64_MAX; + flush_job_info.oldest_blob_file_number = UINT64_MAX; + flush_job_info.thread_id = UINT64_MAX; + flush_job_info.job_id = INT_MAX; + flush_job_info.triggered_writes_slowdown = true; + flush_job_info.triggered_writes_stop = true; + flush_job_info.smallest_seqno = UINT64_MAX; + flush_job_info.largest_seqno = UINT64_MAX; + flush_job_info.table_properties = table_properties; + flush_job_info.flush_reason = FlushReason::kManualFlush; + + el->OnFlushCompleted(nullptr, flush_job_info); + el->OnFlushBegin(nullptr, flush_job_info); + + Status status = Status::Incomplete(Status::SubCode::kNoSpace); + + TableFileDeletionInfo file_deletion_info; + file_deletion_info.db_name = "dbName"; + file_deletion_info.file_path = "/file/path"; + file_deletion_info.job_id = 
INT_MAX; + file_deletion_info.status = status; + + el->OnTableFileDeleted(file_deletion_info); + + CompactionJobInfo compaction_job_info; + compaction_job_info.cf_id = UINT32_MAX; + compaction_job_info.cf_name = "compactionColumnFamily"; + compaction_job_info.status = status; + compaction_job_info.thread_id = UINT64_MAX; + compaction_job_info.job_id = INT_MAX; + compaction_job_info.base_input_level = INT_MAX; + compaction_job_info.output_level = INT_MAX; + compaction_job_info.input_files = {"inputFile.sst"}; + compaction_job_info.input_file_infos = {}; + compaction_job_info.output_files = {"outputFile.sst"}; + compaction_job_info.output_file_infos = {}; + compaction_job_info.table_properties = { + {"tableProperties", std::shared_ptr( + &table_properties, [](TableProperties *) {})}}; + compaction_job_info.compaction_reason = CompactionReason::kFlush; + compaction_job_info.compression = CompressionType::kSnappyCompression; + + compaction_job_info.stats = CompactionJobStats(); + + el->OnCompactionBegin(nullptr, compaction_job_info); + el->OnCompactionCompleted(nullptr, compaction_job_info); + + TableFileCreationInfo file_creation_info; + file_creation_info.file_size = UINT64_MAX; + file_creation_info.table_properties = table_properties; + file_creation_info.status = status; + file_creation_info.file_checksum = "fileChecksum"; + file_creation_info.file_checksum_func_name = "fileChecksumFuncName"; + file_creation_info.db_name = "dbName"; + file_creation_info.cf_name = "columnFamilyName"; + file_creation_info.file_path = "/file/path"; + file_creation_info.job_id = INT_MAX; + file_creation_info.reason = TableFileCreationReason::kMisc; + + el->OnTableFileCreated(file_creation_info); + + TableFileCreationBriefInfo file_creation_brief_info; + file_creation_brief_info.db_name = "dbName"; + file_creation_brief_info.cf_name = "columnFamilyName"; + file_creation_brief_info.file_path = "/file/path"; + file_creation_brief_info.job_id = INT_MAX; + file_creation_brief_info.reason = TableFileCreationReason::kMisc; + + el->OnTableFileCreationStarted(file_creation_brief_info); + + MemTableInfo mem_table_info; + mem_table_info.cf_name = "columnFamilyName"; + mem_table_info.first_seqno = UINT64_MAX; + mem_table_info.earliest_seqno = UINT64_MAX; + mem_table_info.num_entries = UINT64_MAX; + mem_table_info.num_deletes = UINT64_MAX; + + el->OnMemTableSealed(mem_table_info); + el->OnColumnFamilyHandleDeletionStarted(nullptr); + + ExternalFileIngestionInfo file_ingestion_info; + file_ingestion_info.cf_name = "columnFamilyName"; + file_ingestion_info.external_file_path = "/external/file/path"; + file_ingestion_info.internal_file_path = "/internal/file/path"; + file_ingestion_info.global_seqno = UINT64_MAX; + file_ingestion_info.table_properties = table_properties; + el->OnExternalFileIngested(nullptr, file_ingestion_info); + + el->OnBackgroundError(BackgroundErrorReason::kFlush, &status); + + WriteStallInfo write_stall_info; + write_stall_info.cf_name = "columnFamilyName"; + write_stall_info.condition.cur = WriteStallCondition::kDelayed; + write_stall_info.condition.prev = WriteStallCondition::kStopped; + el->OnStallConditionsChanged(write_stall_info); + + FileOperationInfo op_info = FileOperationInfo( + FileOperationType::kRead, "/file/path", + std::make_pair(std::chrono::time_point( + std::chrono::nanoseconds(1600699420000000000ll)), + std::chrono::time_point( + std::chrono::nanoseconds(1600699420000000000ll))), + std::chrono::time_point( + std::chrono::nanoseconds(1600699425000000000ll)), + status); + op_info.offset 
= UINT64_MAX; + op_info.length = SIZE_MAX; + op_info.status = status; + + el->OnFileReadFinish(op_info); + el->OnFileWriteFinish(op_info); + el->OnFileFlushFinish(op_info); + el->OnFileSyncFinish(op_info); + el->OnFileRangeSyncFinish(op_info); + el->OnFileTruncateFinish(op_info); + el->OnFileCloseFinish(op_info); + el->ShouldBeNotifiedOnFileIO(); + + bool auto_recovery; + el->OnErrorRecoveryBegin(BackgroundErrorReason::kFlush, status, + &auto_recovery); + el->OnErrorRecoveryCompleted(status); +} diff --git a/java/rocksjni/transaction.cc b/java/rocksjni/transaction.cc index 3b9123ed13e..720285e7555 100644 --- a/java/rocksjni/transaction.cc +++ b/java/rocksjni/transaction.cc @@ -689,6 +689,7 @@ void txn_write_kv_parts_helper(JNIEnv* env, // out of memory env->DeleteLocalRef(jobj_value_part); env->DeleteLocalRef(jobj_key_part); + env->ReleaseByteArrayElements(jba_key_part, jkey_part, JNI_ABORT); free_parts(env, jparts_to_free); return; } @@ -698,6 +699,7 @@ void txn_write_kv_parts_helper(JNIEnv* env, env->ReleaseByteArrayElements(jba_value_part, jvalue_part, JNI_ABORT); env->DeleteLocalRef(jobj_value_part); env->DeleteLocalRef(jobj_key_part); + env->ReleaseByteArrayElements(jba_key_part, jkey_part, JNI_ABORT); free_parts(env, jparts_to_free); return; } diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc index 77d17c82a3a..a898bffb80e 100644 --- a/java/rocksjni/ttl.cc +++ b/java/rocksjni/ttl.cc @@ -197,7 +197,7 @@ jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl( *cfOptions, std::string(reinterpret_cast(cfname), len), &handle, jttl); - env->ReleaseByteArrayElements(jcolumn_name, cfname, 0); + env->ReleaseByteArrayElements(jcolumn_name, cfname, JNI_ABORT); if (s.ok()) { return reinterpret_cast(handle); diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index c517afcc129..dd6e0b36c7a 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -47,7 +47,7 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env, ROCKSDB_NAMESPACE::WriteBufferManager wb(options.db_write_buffer_size); options.memtable_factory = factory; ROCKSDB_NAMESPACE::MemTable* mem = new ROCKSDB_NAMESPACE::MemTable( - cmp, ROCKSDB_NAMESPACE::ImmutableCFOptions(options), + cmp, ROCKSDB_NAMESPACE::ImmutableOptions(options), ROCKSDB_NAMESPACE::MutableCFOptions(options), &wb, ROCKSDB_NAMESPACE::kMaxSequenceNumber, 0 /* column_family_id */); mem->Ref(); @@ -63,10 +63,10 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env, for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ROCKSDB_NAMESPACE::ParsedInternalKey ikey; ikey.clear(); - ROCKSDB_NAMESPACE::Status pikStatus = - ROCKSDB_NAMESPACE::ParseInternalKey(iter->key(), &ikey); - pikStatus.PermitUncheckedError(); - assert(pikStatus.ok()); + ROCKSDB_NAMESPACE::Status pik_status = ROCKSDB_NAMESPACE::ParseInternalKey( + iter->key(), &ikey, true /* log_err_key */); + pik_status.PermitUncheckedError(); + assert(pik_status.ok()); switch (ikey.type) { case ROCKSDB_NAMESPACE::kTypeValue: state.append("Put("); diff --git a/java/samples/src/main/java/OptimisticTransactionSample.java b/java/samples/src/main/java/OptimisticTransactionSample.java index 1633d1f2bd4..7e7a22e9485 100644 --- a/java/samples/src/main/java/OptimisticTransactionSample.java +++ b/java/samples/src/main/java/OptimisticTransactionSample.java @@ -111,7 +111,7 @@ private static void repeatableRead(final OptimisticTransactionDB txnDb, // Read a key using the snapshot. 
readOptions.setSnapshot(snapshot); final byte[] value = txn.getForUpdate(readOptions, key1, true); - assert(value == value1); + assert (value == null); try { // Attempt to commit transaction diff --git a/java/samples/src/main/java/RocksDBColumnFamilySample.java b/java/samples/src/main/java/RocksDBColumnFamilySample.java index 650b1b2f600..72f5731a1bd 100644 --- a/java/samples/src/main/java/RocksDBColumnFamilySample.java +++ b/java/samples/src/main/java/RocksDBColumnFamilySample.java @@ -53,8 +53,8 @@ public static void main(final String[] args) throws RocksDBException { try { // put and get from non-default column family - db.put(columnFamilyHandles.get(0), new WriteOptions(), - "key".getBytes(), "value".getBytes()); + db.put( + columnFamilyHandles.get(1), new WriteOptions(), "key".getBytes(), "value".getBytes()); // atomic write try (final WriteBatch wb = new WriteBatch()) { @@ -62,7 +62,7 @@ public static void main(final String[] args) throws RocksDBException { "value2".getBytes()); wb.put(columnFamilyHandles.get(1), "key3".getBytes(), "value3".getBytes()); - wb.remove(columnFamilyHandles.get(0), "key".getBytes()); + wb.delete(columnFamilyHandles.get(1), "key".getBytes()); db.write(new WriteOptions(), wb); } diff --git a/java/samples/src/main/java/RocksDBSample.java b/java/samples/src/main/java/RocksDBSample.java index f61995ed98f..0637c11148b 100644 --- a/java/samples/src/main/java/RocksDBSample.java +++ b/java/samples/src/main/java/RocksDBSample.java @@ -45,7 +45,7 @@ public static void main(final String[] args) { .setStatistics(stats) .setWriteBufferSize(8 * SizeUnit.KB) .setMaxWriteBufferNumber(3) - .setMaxBackgroundCompactions(10) + .setMaxBackgroundJobs(10) .setCompressionType(CompressionType.SNAPPY_COMPRESSION) .setCompactionStyle(CompactionStyle.UNIVERSAL); } catch (final IllegalArgumentException e) { @@ -55,7 +55,7 @@ public static void main(final String[] args) { assert (options.createIfMissing() == true); assert (options.writeBufferSize() == 8 * SizeUnit.KB); assert (options.maxWriteBufferNumber() == 3); - assert (options.maxBackgroundCompactions() == 10); + assert (options.maxBackgroundJobs() == 10); assert (options.compressionType() == CompressionType.SNAPPY_COMPRESSION); assert (options.compactionStyle() == CompactionStyle.UNIVERSAL); @@ -87,24 +87,17 @@ public static void main(final String[] args) { options.setRateLimiter(rateLimiter); final BlockBasedTableConfig table_options = new BlockBasedTableConfig(); - table_options.setBlockCacheSize(64 * SizeUnit.KB) - .setFilter(bloomFilter) - .setCacheNumShardBits(6) + Cache cache = new LRUCache(64 * 1024, 6); + table_options.setBlockCache(cache) + .setFilterPolicy(bloomFilter) .setBlockSizeDeviation(5) .setBlockRestartInterval(10) .setCacheIndexAndFilterBlocks(true) - .setHashIndexAllowCollision(false) - .setBlockCacheCompressedSize(64 * SizeUnit.KB) - .setBlockCacheCompressedNumShardBits(10); + .setBlockCacheCompressed(new LRUCache(64 * 1000, 10)); - assert (table_options.blockCacheSize() == 64 * SizeUnit.KB); - assert (table_options.cacheNumShardBits() == 6); assert (table_options.blockSizeDeviation() == 5); assert (table_options.blockRestartInterval() == 10); assert (table_options.cacheIndexAndFilterBlocks() == true); - assert (table_options.hashIndexAllowCollision() == false); - assert (table_options.blockCacheCompressedSize() == 64 * SizeUnit.KB); - assert (table_options.blockCacheCompressedNumShardBits() == 10); options.setTableFormatConfig(table_options); assert (options.tableFactoryName().equals("BlockBasedTable")); @@ 
-203,14 +196,14 @@ public static void main(final String[] args) { len = db.get(readOptions, testKey, enoughArray); assert (len == testValue.length); - db.remove(testKey); + db.delete(testKey); len = db.get(testKey, enoughArray); assert (len == RocksDB.NOT_FOUND); // repeat the test with WriteOptions try (final WriteOptions writeOpts = new WriteOptions()) { writeOpts.setSync(true); - writeOpts.setDisableWAL(true); + writeOpts.setDisableWAL(false); db.put(writeOpts, testKey, testValue); len = db.get(testKey, enoughArray); assert (len == testValue.length); @@ -284,15 +277,15 @@ public static void main(final String[] args) { } } - Map values = db.multiGet(keys); + List values = db.multiGetAsList(keys); assert (values.size() == keys.size()); - for (final byte[] value1 : values.values()) { + for (final byte[] value1 : values) { assert (value1 != null); } - values = db.multiGet(new ReadOptions(), keys); + values = db.multiGetAsList(new ReadOptions(), keys); assert (values.size() == keys.size()); - for (final byte[] value1 : values.values()) { + for (final byte[] value1 : values) { assert (value1 != null); } } catch (final RocksDBException e) { diff --git a/java/src/main/java/org/rocksdb/AbstractEventListener.java b/java/src/main/java/org/rocksdb/AbstractEventListener.java new file mode 100644 index 00000000000..6698acf88f2 --- /dev/null +++ b/java/src/main/java/org/rocksdb/AbstractEventListener.java @@ -0,0 +1,334 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.rocksdb.AbstractEventListener.EnabledEventCallback.*; + +/** + * Base class for Event Listeners. + */ +public abstract class AbstractEventListener extends RocksCallbackObject implements EventListener { + public enum EnabledEventCallback { + ON_FLUSH_COMPLETED((byte) 0x0), + ON_FLUSH_BEGIN((byte) 0x1), + ON_TABLE_FILE_DELETED((byte) 0x2), + ON_COMPACTION_BEGIN((byte) 0x3), + ON_COMPACTION_COMPLETED((byte) 0x4), + ON_TABLE_FILE_CREATED((byte) 0x5), + ON_TABLE_FILE_CREATION_STARTED((byte) 0x6), + ON_MEMTABLE_SEALED((byte) 0x7), + ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED((byte) 0x8), + ON_EXTERNAL_FILE_INGESTED((byte) 0x9), + ON_BACKGROUND_ERROR((byte) 0xA), + ON_STALL_CONDITIONS_CHANGED((byte) 0xB), + ON_FILE_READ_FINISH((byte) 0xC), + ON_FILE_WRITE_FINISH((byte) 0xD), + ON_FILE_FLUSH_FINISH((byte) 0xE), + ON_FILE_SYNC_FINISH((byte) 0xF), + ON_FILE_RANGE_SYNC_FINISH((byte) 0x10), + ON_FILE_TRUNCATE_FINISH((byte) 0x11), + ON_FILE_CLOSE_FINISH((byte) 0x12), + SHOULD_BE_NOTIFIED_ON_FILE_IO((byte) 0x13), + ON_ERROR_RECOVERY_BEGIN((byte) 0x14), + ON_ERROR_RECOVERY_COMPLETED((byte) 0x15); + + private final byte value; + + EnabledEventCallback(final byte value) { + this.value = value; + } + + /** + * Get the internal representation value. + * + * @return the internal representation value + */ + byte getValue() { + return value; + } + + /** + * Get the EnabledEventCallbacks from the internal representation value. + * + * @return the enabled event callback. + * + * @throws IllegalArgumentException if the value is unknown. 
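+   * @param value the internal representation value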
+ */ + static EnabledEventCallback fromValue(final byte value) { + for (final EnabledEventCallback enabledEventCallback : EnabledEventCallback.values()) { + if (enabledEventCallback.value == value) { + return enabledEventCallback; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for EnabledEventCallback: " + value); + } + } + + /** + * Creates an Event Listener that will + * received all callbacks from C++. + * + * If you don't need all callbacks, it is much more efficient to + * just register for the ones you need by calling + * {@link #AbstractEventListener(EnabledEventCallback...)} instead. + */ + protected AbstractEventListener() { + this(ON_FLUSH_COMPLETED, ON_FLUSH_BEGIN, ON_TABLE_FILE_DELETED, ON_COMPACTION_BEGIN, + ON_COMPACTION_COMPLETED, ON_TABLE_FILE_CREATED, ON_TABLE_FILE_CREATION_STARTED, + ON_MEMTABLE_SEALED, ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED, ON_EXTERNAL_FILE_INGESTED, + ON_BACKGROUND_ERROR, ON_STALL_CONDITIONS_CHANGED, ON_FILE_READ_FINISH, ON_FILE_WRITE_FINISH, + ON_FILE_FLUSH_FINISH, ON_FILE_SYNC_FINISH, ON_FILE_RANGE_SYNC_FINISH, + ON_FILE_TRUNCATE_FINISH, ON_FILE_CLOSE_FINISH, SHOULD_BE_NOTIFIED_ON_FILE_IO, + ON_ERROR_RECOVERY_BEGIN, ON_ERROR_RECOVERY_COMPLETED); + } + + /** + * Creates an Event Listener that will + * receive only certain callbacks from C++. + * + * @param enabledEventCallbacks callbacks to enable in Java. + */ + protected AbstractEventListener(final EnabledEventCallback... enabledEventCallbacks) { + super(packToLong(enabledEventCallbacks)); + } + + /** + * Pack EnabledEventCallbacks to a long. + * + * @param enabledEventCallbacks the flags + * + * @return a long + */ + private static long packToLong(final EnabledEventCallback... enabledEventCallbacks) { + long l = 0; + for (int i = 0; i < enabledEventCallbacks.length; i++) { + l |= 1 << enabledEventCallbacks[i].getValue(); + } + return l; + } + + @Override + public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onFlushCompleted(RocksDB, FlushJobInfo)}. + * + * @param dbHandle native handle of the database + * @param flushJobInfo the flush job info + */ + private void onFlushCompletedProxy(final long dbHandle, final FlushJobInfo flushJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onFlushCompleted(db, flushJobInfo); + } + + @Override + public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onFlushBegin(RocksDB, FlushJobInfo)}. + * + * @param dbHandle native handle of the database + * @param flushJobInfo the flush job info + */ + private void onFlushBeginProxy(final long dbHandle, final FlushJobInfo flushJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onFlushBegin(db, flushJobInfo); + } + + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + // no-op + } + + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onCompactionBegin(RocksDB, CompactionJobInfo)}. 
+ * + * @param dbHandle native handle of the database + * @param compactionJobInfo the flush job info + */ + private void onCompactionBeginProxy( + final long dbHandle, final CompactionJobInfo compactionJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onCompactionBegin(db, compactionJobInfo); + } + + @Override + public void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)}. + * + * @param dbHandle native handle of the database + * @param compactionJobInfo the flush job info + */ + private void onCompactionCompletedProxy( + final long dbHandle, final CompactionJobInfo compactionJobInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onCompactionCompleted(db, compactionJobInfo); + } + + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + // no-op + } + + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + // no-op + } + + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + // no-op + } + + @Override + public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) { + // no-op + } + + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onExternalFileIngested(RocksDB, ExternalFileIngestionInfo)}. + * + * @param dbHandle native handle of the database + * @param externalFileIngestionInfo the flush job info + */ + private void onExternalFileIngestedProxy( + final long dbHandle, final ExternalFileIngestionInfo externalFileIngestionInfo) { + final RocksDB db = new RocksDB(dbHandle); + db.disOwnNativeHandle(); // we don't own this! + onExternalFileIngested(db, externalFileIngestionInfo); + } + + @Override + public void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + // no-op + } + + /** + * Called from JNI, proxy for + * {@link #onBackgroundError(BackgroundErrorReason, Status)}. 
+ * + * @param reasonByte byte value representing error reason + * @param backgroundError status with error code + */ + private void onBackgroundErrorProxy(final byte reasonByte, final Status backgroundError) { + onBackgroundError(BackgroundErrorReason.fromValue(reasonByte), backgroundError); + } + + @Override + public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) { + // no-op + } + + @Override + public void onFileReadFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) { + // no-op + } + + @Override + public boolean shouldBeNotifiedOnFileIO() { + return false; + } + + @Override + public boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + return true; + } + + /** + * Called from JNI, proxy for + * {@link #onErrorRecoveryBegin(BackgroundErrorReason, Status)}. + * + * @param reasonByte byte value representing error reason + * @param backgroundError status with error code + */ + private boolean onErrorRecoveryBeginProxy(final byte reasonByte, final Status backgroundError) { + return onErrorRecoveryBegin(BackgroundErrorReason.fromValue(reasonByte), backgroundError); + } + + @Override + public void onErrorRecoveryCompleted(final Status oldBackgroundError) { + // no-op + } + + @Override + protected long initializeNative(final long... nativeParameterHandles) { + return createNewEventListener(nativeParameterHandles[0]); + } + + /** + * Deletes underlying C++ native callback object pointer + */ + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private native long createNewEventListener(final long enabledEventCallbackValues); + private native void disposeInternal(final long handle); +} diff --git a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java index 772a5900b5b..76d9bde4646 100644 --- a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java @@ -301,7 +301,7 @@ T setCompressionPerLevel( * @return the reference to the current options. */ @Experimental("Turning this feature on or off for an existing DB can cause" + - "unexpected LSM tree structure so it's not recommended") + " unexpected LSM tree structure so it's not recommended") T setLevelCompactionDynamicLevelBytes( boolean enableLevelCompactionDynamicLevelBytes); diff --git a/java/src/main/java/org/rocksdb/BackgroundErrorReason.java b/java/src/main/java/org/rocksdb/BackgroundErrorReason.java new file mode 100644 index 00000000000..eec593d35c5 --- /dev/null +++ b/java/src/main/java/org/rocksdb/BackgroundErrorReason.java @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum BackgroundErrorReason { + FLUSH((byte) 0x0), + COMPACTION((byte) 0x1), + WRITE_CALLBACK((byte) 0x2), + MEMTABLE((byte) 0x3); + + private final byte value; + + BackgroundErrorReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the BackgroundErrorReason from the internal representation value. + * + * @return the background error reason. + * + * @throws IllegalArgumentException if the value is unknown. + */ + static BackgroundErrorReason fromValue(final byte value) { + for (final BackgroundErrorReason backgroundErrorReason : BackgroundErrorReason.values()) { + if (backgroundErrorReason.value == value) { + return backgroundErrorReason; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for BackgroundErrorReason: " + value); + } +} diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index 6730e645230..a8f436e2f0a 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -38,7 +38,7 @@ public BlockBasedTableConfig() { wholeKeyFiltering = true; verifyCompression = false; readAmpBytesPerBit = 0; - formatVersion = 4; + formatVersion = 5; enableIndexCompression = true; blockAlign = false; indexShortening = IndexShorteningMode.kShortenSeparators; diff --git a/java/src/main/java/org/rocksdb/Cache.java b/java/src/main/java/org/rocksdb/Cache.java index 3952e1d109c..569a1df06cf 100644 --- a/java/src/main/java/org/rocksdb/Cache.java +++ b/java/src/main/java/org/rocksdb/Cache.java @@ -10,4 +10,31 @@ public abstract class Cache extends RocksObject { protected Cache(final long nativeHandle) { super(nativeHandle); } + + /** + * Returns the memory size for the entries + * residing in cache. + * + * @return cache usage size. + * + */ + public long getUsage() { + assert (isOwningHandle()); + return getUsage(this.nativeHandle_); + } + + /** + * Returns the memory size for the entries + * being pinned in cache. + * + * @return cache pinned usage size. + * + */ + public long getPinnedUsage() { + assert (isOwningHandle()); + return getPinnedUsage(this.nativeHandle_); + } + + private native static long getUsage(final long handle); + private native static long getPinnedUsage(final long handle); } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java index 3a2e97efb57..1ac0a35bbd7 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java @@ -13,6 +13,12 @@ * ColumnFamily Pointers. */ public class ColumnFamilyHandle extends RocksObject { + /** + * Constructs column family Java object, which operates on underlying native object. + * + * @param rocksDB db instance associated with this column family + * @param nativeHandle native handle to underlying native ColumnFamily object + */ ColumnFamilyHandle(final RocksDB rocksDB, final long nativeHandle) { super(nativeHandle); @@ -24,6 +30,28 @@ public class ColumnFamilyHandle extends RocksObject { this.rocksDB_ = rocksDB; } + /** + * Constructor called only from JNI. 
+ * + * NOTE: we are producing an additional Java Object here to represent the underlying native C++ + * ColumnFamilyHandle object. The underlying object is not owned by ourselves. The Java API user + * likely already had a ColumnFamilyHandle Java object which owns the underlying C++ object, as + * they will have been presented it when they opened the database or added a Column Family. + * + * + * TODO(AR) - Potentially a better design would be to cache the active Java Column Family Objects + * in RocksDB, and return the same Java Object instead of instantiating a new one here. This could + * also help us to improve the Java API semantics for Java users. See for example + * https://github.com/facebook/rocksdb/issues/2687. + * + * @param nativeHandle native handle to the column family. + */ + ColumnFamilyHandle(final long nativeHandle) { + super(nativeHandle); + rocksDB_ = null; + disOwnNativeHandle(); + } + /** * Gets the name of the Column Family. * @@ -87,7 +115,9 @@ public boolean equals(final Object o) { @Override public int hashCode() { try { - return Objects.hash(getName(), getID(), rocksDB_.nativeHandle_); + int result = Objects.hash(getID(), rocksDB_.nativeHandle_); + result = 31 * result + Arrays.hashCode(getName()); + return result; } catch (RocksDBException e) { throw new RuntimeException("Cannot calculate hash code of column family handle", e); } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java index 825c34973c5..72149bf2669 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java @@ -339,7 +339,7 @@ public List compressionPerLevel() { final byte[] byteCompressionTypes = compressionPerLevel(nativeHandle_); final List compressionLevels = new ArrayList<>(); - for (final Byte byteCompressionType : byteCompressionTypes) { + for (final byte byteCompressionType : byteCompressionTypes) { compressionLevels.add(CompressionType.getCompressionType( byteCompressionType)); } diff --git a/java/src/main/java/org/rocksdb/CompactionJobInfo.java b/java/src/main/java/org/rocksdb/CompactionJobInfo.java index 8b59edc91db..4e3b8d68b82 100644 --- a/java/src/main/java/org/rocksdb/CompactionJobInfo.java +++ b/java/src/main/java/org/rocksdb/CompactionJobInfo.java @@ -20,6 +20,8 @@ public CompactionJobInfo() { */ private CompactionJobInfo(final long nativeHandle) { super(nativeHandle); + // We do not own the native object! 
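+    // The native struct is owned by the C++ side and is released once the
+    // listener callback that produced it returns, so it must never be freed
+    // from Java.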
+ disOwnNativeHandle(); } /** diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java index a3eef513e31..2930a92728b 100644 --- a/java/src/main/java/org/rocksdb/DBOptions.java +++ b/java/src/main/java/org/rocksdb/DBOptions.java @@ -884,32 +884,18 @@ public boolean strictBytesPerSync() { return strictBytesPerSync(nativeHandle_); } - //TODO(AR) NOW -// @Override -// public DBOptions setListeners(final List listeners) { -// assert(isOwningHandle()); -// final long[] eventListenerHandlers = new long[listeners.size()]; -// for (int i = 0; i < eventListenerHandlers.length; i++) { -// eventListenerHandlers[i] = listeners.get(i).nativeHandle_; -// } -// setEventListeners(nativeHandle_, eventListenerHandlers); -// return this; -// } -// -// @Override -// public Collection listeners() { -// assert(isOwningHandle()); -// final long[] eventListenerHandlers = listeners(nativeHandle_); -// if (eventListenerHandlers == null || eventListenerHandlers.length == 0) { -// return Collections.emptyList(); -// } -// -// final List eventListeners = new ArrayList<>(); -// for (final long eventListenerHandle : eventListenerHandlers) { -// eventListeners.add(new EventListener(eventListenerHandle)); //TODO(AR) check ownership is set to false! -// } -// return eventListeners; -// } + @Override + public DBOptions setListeners(final List listeners) { + assert (isOwningHandle()); + setEventListeners(nativeHandle_, RocksCallbackObject.toNativeHandleList(listeners)); + return this; + } + + @Override + public List listeners() { + assert (isOwningHandle()); + return Arrays.asList(eventListeners(nativeHandle_)); + } @Override public DBOptions setEnableThreadTracking(final boolean enableThreadTracking) { @@ -1459,6 +1445,9 @@ private native void setStrictBytesPerSync( final long handle, final boolean strictBytesPerSync); private native boolean strictBytesPerSync( final long handle); + private static native void setEventListeners( + final long handle, final long[] eventListenerHandles); + private static native AbstractEventListener[] eventListeners(final long handle); private native void setEnableThreadTracking(long handle, boolean enableThreadTracking); private native boolean enableThreadTracking(long handle); diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java index 72be7d0e647..6609e0ad307 100644 --- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -625,7 +625,7 @@ public interface DBOptionsInterface> { * then WAL_size_limit_MB, they will be deleted starting with the * earliest until size_limit is met. All empty files will be deleted. *
   *    <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
-  *       WAL files will be checked every WAL_ttl_secondsi / 2 and those that
+  *       WAL files will be checked every WAL_ttl_seconds / 2 and those that
   *       are older than WAL_ttl_seconds will be deleted.</li>
   *    <li>If both are not 0, WAL files will be checked every 10 min and both
   *       checks will be performed with ttl being first.</li>
@@ -648,7 +648,7 @@ public interface DBOptionsInterface> {
   *    then WAL_size_limit_MB, they will be deleted starting with the
   *    earliest until size_limit is met. All empty files will be deleted.
   *    <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
-  *       WAL files will be checked every WAL_ttl_secondsi / 2 and those that
+  *       WAL files will be checked every WAL_ttl_seconds / 2 and those that
   *       are older than WAL_ttl_seconds will be deleted.</li>
   *    <li>If both are not 0, WAL files will be checked every 10 min and both
   *       checks will be performed with ttl being first.</li>
  • @@ -1055,24 +1055,31 @@ T setNewTableReaderForCompactionInputs( */ boolean useAdaptiveMutex(); - //TODO(AR) NOW -// /** -// * Sets the {@link EventListener}s whose callback functions -// * will be called when specific RocksDB event happens. -// * -// * @param listeners the listeners who should be notified on various events. -// * -// * @return the instance of the current object. -// */ -// T setListeners(final List listeners); -// -// /** -// * Gets the {@link EventListener}s whose callback functions -// * will be called when specific RocksDB event happens. -// * -// * @return a collection of Event listeners. -// */ -// Collection listeners(); + /** + * Sets the {@link EventListener}s whose callback functions + * will be called when specific RocksDB event happens. + * + * Note: the RocksJava API currently only supports EventListeners implemented in Java. + * It could be extended in future to also support adding/removing EventListeners implemented in + * C++. + * + * @param listeners the listeners who should be notified on various events. + * + * @return the instance of the current object. + */ + T setListeners(final List listeners); + + /** + * Sets the {@link EventListener}s whose callback functions + * will be called when specific RocksDB event happens. + * + * Note: the RocksJava API currently only supports EventListeners implemented in Java. + * It could be extended in future to also support adding/removing EventListeners implemented in + * C++. + * + * @return the instance of the current object. + */ + List listeners(); /** * If true, then the status of the threads involved in this DB will diff --git a/java/src/main/java/org/rocksdb/DirectSlice.java b/java/src/main/java/org/rocksdb/DirectSlice.java index b0d35c3cc5a..02fa3511fc0 100644 --- a/java/src/main/java/org/rocksdb/DirectSlice.java +++ b/java/src/main/java/org/rocksdb/DirectSlice.java @@ -110,6 +110,10 @@ public void removePrefix(final int n) { this.internalBufferOffset += n; } + public void setLength(final int n) { + setLength0(getNativeHandle(), n); + } + @Override protected void disposeInternal() { final long nativeHandle = getNativeHandle(); @@ -127,6 +131,7 @@ private native static long createNewDirectSlice0(final ByteBuffer data, private native void clear0(long handle, boolean internalBuffer, long internalBufferOffset); private native void removePrefix0(long handle, int length); + private native void setLength0(long handle, int length); private native void disposeInternalBuf(final long handle, long internalBufferOffset); } diff --git a/java/src/main/java/org/rocksdb/EventListener.java b/java/src/main/java/org/rocksdb/EventListener.java new file mode 100644 index 00000000000..808278d02b1 --- /dev/null +++ b/java/src/main/java/org/rocksdb/EventListener.java @@ -0,0 +1,332 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.List; + +/** + * EventListener class contains a set of callback functions that will + * be called when specific RocksDB event happens such as flush. It can + * be used as a building block for developing custom features such as + * stats-collector or external compaction algorithm. + * + * Note that callback functions should not run for an extended period of + * time before the function returns, otherwise RocksDB may be blocked. 
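+ * A listener is normally implemented by extending {@link AbstractEventListener}
+ * and registering it through
+ * {@link DBOptionsInterface#setListeners(java.util.List)} before the database
+ * is opened. A minimal sketch (the database path, the key/value bytes and the
+ * single enabled callback are illustrative only, and error handling is
+ * omitted):
+ * <pre>{@code
+ *   final AbstractEventListener listener = new AbstractEventListener(
+ *       AbstractEventListener.EnabledEventCallback.ON_FLUSH_COMPLETED) {
+ *     public void onFlushCompleted(final RocksDB db, final FlushJobInfo info) {
+ *       System.out.println("Flushed: " + info.getFilePath());
+ *     }
+ *   };
+ *   final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList(
+ *       new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+ *   final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+ *   try (final DBOptions options = new DBOptions()
+ *            .setCreateIfMissing(true)
+ *            .setListeners(Collections.singletonList(listener));
+ *        final RocksDB db = RocksDB.open(options, "/tmp/listener-example",
+ *            cfDescriptors, cfHandles)) {
+ *     db.put(cfHandles.get(0), "key".getBytes(), "value".getBytes());
+ *     db.flush(new FlushOptions(), cfHandles.get(0));
+ *   }
+ * }</pre>
+ * Whatever a callback does, it must return promptly.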
+ * For example, it is not suggested to do + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, + * CompactionJobInfo)} (as it may run for a long while) or issue many of + * {@link RocksDB#put(ColumnFamilyHandle, WriteOptions, byte[], byte[])} + * (as Put may be blocked in certain cases) in the same thread in the + * EventListener callback. + * + * However, doing + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, + * CompactionJobInfo)} and {@link RocksDB#put(ColumnFamilyHandle, WriteOptions, byte[], byte[])} in + * another thread is considered safe. + * + * [Threading] All EventListener callback will be called using the + * actual thread that involves in that specific event. For example, it + * is the RocksDB background flush thread that does the actual flush to + * call {@link #onFlushCompleted(RocksDB, FlushJobInfo)}. + * + * [Locking] All EventListener callbacks are designed to be called without + * the current thread holding any DB mutex. This is to prevent potential + * deadlock and performance issue when using EventListener callback + * in a complex way. + */ +public interface EventListener { + /** + * A callback function to RocksDB which will be called before a + * RocksDB starts to flush memtables. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db the database + * @param flushJobInfo the flush job info, contains data copied from + * respective native structure. + */ + void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo); + + /** + * callback function to RocksDB which will be called whenever a + * registered RocksDB flushes a file. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db the database + * @param flushJobInfo the flush job info, contains data copied from + * respective native structure. + */ + void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo); + + /** + * A callback function for RocksDB which will be called whenever + * a SST file is deleted. Different from + * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)} and + * {@link #onFlushCompleted(RocksDB, FlushJobInfo)}, + * this callback is designed for external logging + * service and thus only provide string parameters instead + * of a pointer to DB. Applications that build logic basic based + * on file creations and deletions is suggested to implement + * {@link #onFlushCompleted(RocksDB, FlushJobInfo)} and + * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)}. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from the + * returned value. + * + * @param tableFileDeletionInfo the table file deletion info, + * contains data copied from respective native structure. + */ + void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo); + + /** + * A callback function to RocksDB which will be called before a + * RocksDB starts to compact. The default implementation is + * no-op. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. 
+ * + * @param db a pointer to the rocksdb instance which just compacted + * a file. + * @param compactionJobInfo a reference to a native CompactionJobInfo struct, + * which is released after this function is returned, and must be copied + * if it is needed outside of this function. + */ + void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo); + + /** + * A callback function for RocksDB which will be called whenever + * a registered RocksDB compacts a file. The default implementation + * is a no-op. + * + * Note that this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param db a pointer to the rocksdb instance which just compacted + * a file. + * @param compactionJobInfo a reference to a native CompactionJobInfo struct, + * which is released after this function is returned, and must be copied + * if it is needed outside of this function. + */ + void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo); + + /** + * A callback function for RocksDB which will be called whenever + * a SST file is created. Different from OnCompactionCompleted and + * OnFlushCompleted, this callback is designed for external logging + * service and thus only provide string parameters instead + * of a pointer to DB. Applications that build logic basic based + * on file creations and deletions is suggested to implement + * OnFlushCompleted and OnCompactionCompleted. + * + * Historically it will only be called if the file is successfully created. + * Now it will also be called on failure case. User can check info.status + * to see if it succeeded or not. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from these + * returned value. + * + * @param tableFileCreationInfo the table file creation info, + * contains data copied from respective native structure. + */ + void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo); + + /** + * A callback function for RocksDB which will be called before + * a SST file is being created. It will follow by OnTableFileCreated after + * the creation finishes. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from these + * returned value. + * + * @param tableFileCreationBriefInfo the table file creation brief info, + * contains data copied from respective native structure. + */ + void onTableFileCreationStarted(final TableFileCreationBriefInfo tableFileCreationBriefInfo); + + /** + * A callback function for RocksDB which will be called before + * a memtable is made immutable. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * Note that if applications would like to use the passed reference + * outside this function call, they should make copies from these + * returned value. + * + * @param memTableInfo the mem table info, contains data + * copied from respective native structure. + */ + void onMemTableSealed(final MemTableInfo memTableInfo); + + /** + * A callback function for RocksDB which will be called before + * a column family handle is deleted. 
+ * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param columnFamilyHandle is a pointer to the column family handle to be + * deleted which will become a dangling pointer after the deletion. + */ + void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle); + + /** + * A callback function for RocksDB which will be called after an external + * file is ingested using IngestExternalFile. + * + * Note that the this function will run on the same thread as + * IngestExternalFile(), if this function is blocked, IngestExternalFile() + * will be blocked from finishing. + * + * @param db the database + * @param externalFileIngestionInfo the external file ingestion info, + * contains data copied from respective native structure. + */ + void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo); + + /** + * A callback function for RocksDB which will be called before setting the + * background error status to a non-OK value. The new background error status + * is provided in `bg_error` and can be modified by the callback. E.g., a + * callback can suppress errors by resetting it to Status::OK(), thus + * preventing the database from entering read-only mode. We do not provide any + * guarantee when failed flushes/compactions will be rescheduled if the user + * suppresses an error. + * + * Note that this function can run on the same threads as flush, compaction, + * and user writes. So, it is extremely important not to perform heavy + * computations or blocking calls in this function. + * + * @param backgroundErrorReason background error reason code + * @param backgroundError background error codes + */ + void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError); + + /** + * A callback function for RocksDB which will be called whenever a change + * of superversion triggers a change of the stall conditions. + * + * Note that the this function must be implemented in a way such that + * it should not run for an extended period of time before the function + * returns. Otherwise, RocksDB may be blocked. + * + * @param writeStallInfo write stall info, + * contains data copied from respective native structure. + */ + void onStallConditionsChanged(final WriteStallInfo writeStallInfo); + + /** + * A callback function for RocksDB which will be called whenever a file read + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileReadFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file write + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileWriteFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file flush + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileFlushFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file sync + * operation finishes. 
+ * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileSyncFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file + * rangeSync operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file + * truncate operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileTruncateFinish(final FileOperationInfo fileOperationInfo); + + /** + * A callback function for RocksDB which will be called whenever a file close + * operation finishes. + * + * @param fileOperationInfo file operation info, + * contains data copied from respective native structure. + */ + void onFileCloseFinish(final FileOperationInfo fileOperationInfo); + + /** + * If true, the {@link #onFileReadFinish(FileOperationInfo)} + * and {@link #onFileWriteFinish(FileOperationInfo)} will be called. If + * false, then they won't be called. + * + * Default: false + */ + boolean shouldBeNotifiedOnFileIO(); + + /** + * A callback function for RocksDB which will be called just before + * starting the automatic recovery process for recoverable background + * errors, such as NoSpace(). The callback can suppress the automatic + * recovery by setting returning false. The database will then + * have to be transitioned out of read-only mode by calling + * RocksDB#resume(). + * + * @param backgroundErrorReason background error reason code + * @param backgroundError background error codes + */ + boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError); + + /** + * A callback function for RocksDB which will be called once the database + * is recovered from read-only mode after an error. When this is called, it + * means normal writes to the database can be issued and the user can + * initiate any further recovery actions needed + * + * @param oldBackgroundError old background error codes + */ + void onErrorRecoveryCompleted(final Status oldBackgroundError); +} diff --git a/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java b/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java new file mode 100644 index 00000000000..6b14a80240b --- /dev/null +++ b/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class ExternalFileIngestionInfo { + private final String columnFamilyName; + private final String externalFilePath; + private final String internalFilePath; + private final long globalSeqno; + private final TableProperties tableProperties; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. 
+ */ + ExternalFileIngestionInfo(final String columnFamilyName, final String externalFilePath, + final String internalFilePath, final long globalSeqno, + final TableProperties tableProperties) { + this.columnFamilyName = columnFamilyName; + this.externalFilePath = externalFilePath; + this.internalFilePath = internalFilePath; + this.globalSeqno = globalSeqno; + this.tableProperties = tableProperties; + } + + /** + * Get the name of the column family. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the path of the file outside the DB. + * + * @return the path of the file outside the DB. + */ + public String getExternalFilePath() { + return externalFilePath; + } + + /** + * Get the path of the file inside the DB. + * + * @return the path of the file inside the DB. + */ + public String getInternalFilePath() { + return internalFilePath; + } + + /** + * Get the global sequence number assigned to keys in this file. + * + * @return the global sequence number. + */ + public long getGlobalSeqno() { + return globalSeqno; + } + + /** + * Get the Table properties of the table being flushed. + * + * @return the table properties. + */ + public TableProperties getTableProperties() { + return tableProperties; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + ExternalFileIngestionInfo that = (ExternalFileIngestionInfo) o; + return globalSeqno == that.globalSeqno + && Objects.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(externalFilePath, that.externalFilePath) + && Objects.equals(internalFilePath, that.internalFilePath) + && Objects.equals(tableProperties, that.tableProperties); + } + + @Override + public int hashCode() { + return Objects.hash( + columnFamilyName, externalFilePath, internalFilePath, globalSeqno, tableProperties); + } + + @Override + public String toString() { + return "ExternalFileIngestionInfo{" + + "columnFamilyName='" + columnFamilyName + '\'' + ", externalFilePath='" + externalFilePath + + '\'' + ", internalFilePath='" + internalFilePath + '\'' + ", globalSeqno=" + globalSeqno + + ", tableProperties=" + tableProperties + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/FileOperationInfo.java b/java/src/main/java/org/rocksdb/FileOperationInfo.java new file mode 100644 index 00000000000..aa5743ed377 --- /dev/null +++ b/java/src/main/java/org/rocksdb/FileOperationInfo.java @@ -0,0 +1,112 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +/** + * Java representation of FileOperationInfo struct from include/rocksdb/listener.h + */ +public class FileOperationInfo { + private final String path; + private final long offset; + private final long length; + private final long startTimestamp; + private final long duration; + private final Status status; + + /** + * Access is private as this will only be constructed from + * C++ via JNI. 
+ */ + FileOperationInfo(final String path, final long offset, final long length, + final long startTimestamp, final long duration, final Status status) { + this.path = path; + this.offset = offset; + this.length = length; + this.startTimestamp = startTimestamp; + this.duration = duration; + this.status = status; + } + + /** + * Get the file path. + * + * @return the file path. + */ + public String getPath() { + return path; + } + + /** + * Get the offset. + * + * @return the offset. + */ + public long getOffset() { + return offset; + } + + /** + * Get the length. + * + * @return the length. + */ + public long getLength() { + return length; + } + + /** + * Get the start timestamp (in nanoseconds). + * + * @return the start timestamp. + */ + public long getStartTimestamp() { + return startTimestamp; + } + + /** + * Get the operation duration (in nanoseconds). + * + * @return the operation duration. + */ + public long getDuration() { + return duration; + } + + /** + * Get the status. + * + * @return the status. + */ + public Status getStatus() { + return status; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + FileOperationInfo that = (FileOperationInfo) o; + return offset == that.offset && length == that.length && startTimestamp == that.startTimestamp + && duration == that.duration && Objects.equals(path, that.path) + && Objects.equals(status, that.status); + } + + @Override + public int hashCode() { + return Objects.hash(path, offset, length, startTimestamp, duration, status); + } + + @Override + public String toString() { + return "FileOperationInfo{" + + "path='" + path + '\'' + ", offset=" + offset + ", length=" + length + ", startTimestamp=" + + startTimestamp + ", duration=" + duration + ", status=" + status + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/FlushJobInfo.java b/java/src/main/java/org/rocksdb/FlushJobInfo.java new file mode 100644 index 00000000000..ca9aa05236b --- /dev/null +++ b/java/src/main/java/org/rocksdb/FlushJobInfo.java @@ -0,0 +1,186 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class FlushJobInfo { + private final long columnFamilyId; + private final String columnFamilyName; + private final String filePath; + private final long threadId; + private final int jobId; + private final boolean triggeredWritesSlowdown; + private final boolean triggeredWritesStop; + private final long smallestSeqno; + private final long largestSeqno; + private final TableProperties tableProperties; + private final FlushReason flushReason; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. 
+ */ + FlushJobInfo(final long columnFamilyId, final String columnFamilyName, final String filePath, + final long threadId, final int jobId, final boolean triggeredWritesSlowdown, + final boolean triggeredWritesStop, final long smallestSeqno, final long largestSeqno, + final TableProperties tableProperties, final byte flushReasonValue) { + this.columnFamilyId = columnFamilyId; + this.columnFamilyName = columnFamilyName; + this.filePath = filePath; + this.threadId = threadId; + this.jobId = jobId; + this.triggeredWritesSlowdown = triggeredWritesSlowdown; + this.triggeredWritesStop = triggeredWritesStop; + this.smallestSeqno = smallestSeqno; + this.largestSeqno = largestSeqno; + this.tableProperties = tableProperties; + this.flushReason = FlushReason.fromValue(flushReasonValue); + } + + /** + * Get the id of the column family. + * + * @return the id of the column family + */ + public long getColumnFamilyId() { + return columnFamilyId; + } + + /** + * Get the name of the column family. + * + * @return the name of the column family + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the path to the newly created file. + * + * @return the path to the newly created file + */ + public String getFilePath() { + return filePath; + } + + /** + * Get the id of the thread that completed this flush job. + * + * @return the id of the thread that completed this flush job + */ + public long getThreadId() { + return threadId; + } + + /** + * Get the job id, which is unique in the same thread. + * + * @return the job id + */ + public int getJobId() { + return jobId; + } + + /** + * Determine if rocksdb is currently slowing-down all writes to prevent + * creating too many Level 0 files as compaction seems not able to + * catch up the write request speed. + * + * This indicates that there are too many files in Level 0. + * + * @return true if rocksdb is currently slowing-down all writes, + * false otherwise + */ + public boolean isTriggeredWritesSlowdown() { + return triggeredWritesSlowdown; + } + + /** + * Determine if rocksdb is currently blocking any writes to prevent + * creating more L0 files. + * + * This indicates that there are too many files in level 0. + * Compactions should try to compact L0 files down to lower levels as soon + * as possible. + * + * @return true if rocksdb is currently blocking any writes, false otherwise + */ + public boolean isTriggeredWritesStop() { + return triggeredWritesStop; + } + + /** + * Get the smallest sequence number in the newly created file. + * + * @return the smallest sequence number + */ + public long getSmallestSeqno() { + return smallestSeqno; + } + + /** + * Get the largest sequence number in the newly created file. + * + * @return the largest sequence number + */ + public long getLargestSeqno() { + return largestSeqno; + } + + /** + * Get the Table properties of the table being flushed. + * + * @return the Table properties of the table being flushed + */ + public TableProperties getTableProperties() { + return tableProperties; + } + + /** + * Get the reason for initiating the flush. + * + * @return the reason for initiating the flush. 
+ */ + public FlushReason getFlushReason() { + return flushReason; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + FlushJobInfo that = (FlushJobInfo) o; + return columnFamilyId == that.columnFamilyId && threadId == that.threadId && jobId == that.jobId + && triggeredWritesSlowdown == that.triggeredWritesSlowdown + && triggeredWritesStop == that.triggeredWritesStop && smallestSeqno == that.smallestSeqno + && largestSeqno == that.largestSeqno + && Objects.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(filePath, that.filePath) + && Objects.equals(tableProperties, that.tableProperties) && flushReason == that.flushReason; + } + + @Override + public int hashCode() { + return Objects.hash(columnFamilyId, columnFamilyName, filePath, threadId, jobId, + triggeredWritesSlowdown, triggeredWritesStop, smallestSeqno, largestSeqno, tableProperties, + flushReason); + } + + @Override + public String toString() { + return "FlushJobInfo{" + + "columnFamilyId=" + columnFamilyId + ", columnFamilyName='" + columnFamilyName + '\'' + + ", filePath='" + filePath + '\'' + ", threadId=" + threadId + ", jobId=" + jobId + + ", triggeredWritesSlowdown=" + triggeredWritesSlowdown + + ", triggeredWritesStop=" + triggeredWritesStop + ", smallestSeqno=" + smallestSeqno + + ", largestSeqno=" + largestSeqno + ", tableProperties=" + tableProperties + + ", flushReason=" + flushReason + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/FlushReason.java b/java/src/main/java/org/rocksdb/FlushReason.java new file mode 100644 index 00000000000..9d486cda16b --- /dev/null +++ b/java/src/main/java/org/rocksdb/FlushReason.java @@ -0,0 +1,53 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum FlushReason { + OTHERS((byte) 0x00), + GET_LIVE_FILES((byte) 0x01), + SHUTDOWN((byte) 0x02), + EXTERNAL_FILE_INGESTION((byte) 0x03), + MANUAL_COMPACTION((byte) 0x04), + WRITE_BUFFER_MANAGER((byte) 0x05), + WRITE_BUFFER_FULL((byte) 0x06), + TEST((byte) 0x07), + DELETE_FILES((byte) 0x08), + AUTO_COMPACTION((byte) 0x09), + MANUAL_FLUSH((byte) 0x0a), + ERROR_RECOVERY((byte) 0xb); + + private final byte value; + + FlushReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the FlushReason from the internal representation value. + * + * @return the flush reason. + * + * @throws IllegalArgumentException if the value is unknown. 
+ */ + static FlushReason fromValue(final byte value) { + for (final FlushReason flushReason : FlushReason.values()) { + if (flushReason.value == value) { + return flushReason; + } + } + + throw new IllegalArgumentException("Illegal value provided for FlushReason: " + value); + } +} diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java index 80d7c600edb..5953a7d9bdd 100644 --- a/java/src/main/java/org/rocksdb/HistogramType.java +++ b/java/src/main/java/org/rocksdb/HistogramType.java @@ -175,6 +175,11 @@ public enum HistogramType { */ NUM_SST_READ_PER_LEVEL((byte) 0x31), + /** + * The number of retry in auto resume + */ + ERROR_HANDLER_AUTORESUME_RETRY_COUNT((byte) 0x32), + // 0x1F for backwards compatibility on current minor version. HISTOGRAM_ENUM_MAX((byte) 0x1F); diff --git a/java/src/main/java/org/rocksdb/MemTableInfo.java b/java/src/main/java/org/rocksdb/MemTableInfo.java new file mode 100644 index 00000000000..f4fb577c3a9 --- /dev/null +++ b/java/src/main/java/org/rocksdb/MemTableInfo.java @@ -0,0 +1,103 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class MemTableInfo { + private final String columnFamilyName; + private final long firstSeqno; + private final long earliestSeqno; + private final long numEntries; + private final long numDeletes; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + MemTableInfo(final String columnFamilyName, final long firstSeqno, final long earliestSeqno, + final long numEntries, final long numDeletes) { + this.columnFamilyName = columnFamilyName; + this.firstSeqno = firstSeqno; + this.earliestSeqno = earliestSeqno; + this.numEntries = numEntries; + this.numDeletes = numDeletes; + } + + /** + * Get the name of the column family to which memtable belongs. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the Sequence number of the first element that was inserted into the + * memtable. + * + * @return the sequence number of the first inserted element. + */ + public long getFirstSeqno() { + return firstSeqno; + } + + /** + * Get the Sequence number that is guaranteed to be smaller than or equal + * to the sequence number of any key that could be inserted into this + * memtable. It can then be assumed that any write with a larger(or equal) + * sequence number will be present in this memtable or a later memtable. + * + * @return the earliest sequence number. + */ + public long getEarliestSeqno() { + return earliestSeqno; + } + + /** + * Get the total number of entries in memtable. + * + * @return the total number of entries. + */ + public long getNumEntries() { + return numEntries; + } + + /** + * Get the total number of deletes in memtable. + * + * @return the total number of deletes. 
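A short sketch for MemTableInfo as well; onMemTableSealed(MemTableInfo) is an assumed callback name taken from the C++ EventListener API.

final AbstractEventListener memTableListener = new AbstractEventListener() {
  @Override
  public void onMemTableSealed(final MemTableInfo info) { // assumed callback name
    // Every entry in the sealed memtable has a sequence number >= getEarliestSeqno().
    System.out.println(info.getColumnFamilyName() + ": sealed memtable, "
        + info.getNumEntries() + " entries, " + info.getNumDeletes() + " deletes");
  }
};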
+ */ + public long getNumDeletes() { + return numDeletes; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + MemTableInfo that = (MemTableInfo) o; + return firstSeqno == that.firstSeqno && earliestSeqno == that.earliestSeqno + && numEntries == that.numEntries && numDeletes == that.numDeletes + && Objects.equals(columnFamilyName, that.columnFamilyName); + } + + @Override + public int hashCode() { + return Objects.hash(columnFamilyName, firstSeqno, earliestSeqno, numEntries, numDeletes); + } + + @Override + public String toString() { + return "MemTableInfo{" + + "columnFamilyName='" + columnFamilyName + '\'' + ", firstSeqno=" + firstSeqno + + ", earliestSeqno=" + earliestSeqno + ", numEntries=" + numEntries + + ", numDeletes=" + numDeletes + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java index 95d1daa8097..57f3aeffbd3 100644 --- a/java/src/main/java/org/rocksdb/Options.java +++ b/java/src/main/java/org/rocksdb/Options.java @@ -970,6 +970,19 @@ public boolean strictBytesPerSync() { return strictBytesPerSync(nativeHandle_); } + @Override + public Options setListeners(final List listeners) { + assert (isOwningHandle()); + setEventListeners(nativeHandle_, RocksCallbackObject.toNativeHandleList(listeners)); + return this; + } + + @Override + public List listeners() { + assert (isOwningHandle()); + return Arrays.asList(eventListeners(nativeHandle_)); + } + @Override public Options setEnableThreadTracking(final boolean enableThreadTracking) { assert(isOwningHandle()); @@ -1395,7 +1408,7 @@ public List compressionPerLevel() { final byte[] byteCompressionTypes = compressionPerLevel(nativeHandle_); final List compressionLevels = new ArrayList<>(); - for (final Byte byteCompressionType : byteCompressionTypes) { + for (final byte byteCompressionType : byteCompressionTypes) { compressionLevels.add(CompressionType.getCompressionType( byteCompressionType)); } @@ -2151,6 +2164,9 @@ private native void setStrictBytesPerSync( final long handle, final boolean strictBytesPerSync); private native boolean strictBytesPerSync( final long handle); + private static native void setEventListeners( + final long handle, final long[] eventListenerHandles); + private static native AbstractEventListener[] eventListeners(final long handle); private native void setEnableThreadTracking(long handle, boolean enableThreadTracking); private native boolean enableThreadTracking(long handle); diff --git a/java/src/main/java/org/rocksdb/ReadOptions.java b/java/src/main/java/org/rocksdb/ReadOptions.java index 1f1510568ff..8e287eb9d7a 100644 --- a/java/src/main/java/org/rocksdb/ReadOptions.java +++ b/java/src/main/java/org/rocksdb/ReadOptions.java @@ -437,16 +437,15 @@ public ReadOptions setIgnoreRangeDeletions(final boolean ignoreRangeDeletions) { * * Default: null * - * @param iterateLowerBound Slice representing the upper bound + * @param iterateLowerBound Slice representing the lower bound * @return the reference to the current ReadOptions. */ - public ReadOptions setIterateLowerBound(final Slice iterateLowerBound) { + public ReadOptions setIterateLowerBound(final AbstractSlice iterateLowerBound) { assert(isOwningHandle()); - if (iterateLowerBound != null) { - // Hold onto a reference so it doesn't get garbage collected out from under us. 
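Illustrative use of the widened bound setters (a sketch assuming db is an already-open RocksDB instance): any AbstractSlice, e.g. a plain Slice, can now be passed, and null clears the bound rather than being silently ignored.

try (final Slice upper = new Slice("key4");
     final ReadOptions readOptions = new ReadOptions().setIterateUpperBound(upper);
     final RocksIterator it = db.newIterator(readOptions)) {
  for (it.seekToFirst(); it.isValid(); it.next()) {
    // Only keys strictly less than "key4" are visited.
    System.out.println(new String(it.key()) + " = " + new String(it.value()));
  }
  readOptions.setIterateUpperBound(null); // clears the bound for subsequently created iterators
}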
- iterateLowerBoundSlice_ = iterateLowerBound; - setIterateLowerBound(nativeHandle_, iterateLowerBoundSlice_.getNativeHandle()); - } + setIterateLowerBound( + nativeHandle_, iterateLowerBound == null ? 0 : iterateLowerBound.getNativeHandle()); + // Hold onto a reference so it doesn't get garbage collected out from under us. + iterateLowerBoundSlice_ = iterateLowerBound; return this; } @@ -485,13 +484,12 @@ public Slice iterateLowerBound() { * @param iterateUpperBound Slice representing the upper bound * @return the reference to the current ReadOptions. */ - public ReadOptions setIterateUpperBound(final Slice iterateUpperBound) { + public ReadOptions setIterateUpperBound(final AbstractSlice iterateUpperBound) { assert(isOwningHandle()); - if (iterateUpperBound != null) { - // Hold onto a reference so it doesn't get garbage collected out from under us. - iterateUpperBoundSlice_ = iterateUpperBound; - setIterateUpperBound(nativeHandle_, iterateUpperBoundSlice_.getNativeHandle()); - } + setIterateUpperBound( + nativeHandle_, iterateUpperBound == null ? 0 : iterateUpperBound.getNativeHandle()); + // Hold onto a reference so it doesn't get garbage collected out from under us. + iterateUpperBoundSlice_ = iterateUpperBound; return this; } @@ -570,8 +568,8 @@ public long iterStartSeqnum() { // freely leave scope without us losing the Java Slice object, which during // close() would also reap its associated rocksdb::Slice native object since // it's possibly (likely) to be an owning handle. - private Slice iterateLowerBoundSlice_; - private Slice iterateUpperBoundSlice_; + private AbstractSlice iterateLowerBoundSlice_; + private AbstractSlice iterateUpperBoundSlice_; private native static long newReadOptions(); private native static long newReadOptions(final boolean verifyChecksums, diff --git a/java/src/main/java/org/rocksdb/RocksCallbackObject.java b/java/src/main/java/org/rocksdb/RocksCallbackObject.java index a662f78fd70..8d7a867ee7c 100644 --- a/java/src/main/java/org/rocksdb/RocksCallbackObject.java +++ b/java/src/main/java/org/rocksdb/RocksCallbackObject.java @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.List; + /** * RocksCallbackObject is similar to {@link RocksObject} but varies * in its construction as it is designed for Java objects which have functions @@ -26,6 +28,27 @@ protected RocksCallbackObject(final long... nativeParameterHandles) { this.nativeHandle_ = initializeNative(nativeParameterHandles); } + /** + * Given a list of RocksCallbackObjects, it returns a list + * of the native handles of the underlying objects. + * + * @param objectList the rocks callback objects + * + * @return the native handles + */ + static /* @Nullable */ long[] toNativeHandleList( + /* @Nullable */ final List objectList) { + if (objectList == null) { + return null; + } + final int len = objectList.size(); + final long[] handleList = new long[len]; + for (int i = 0; i < len; i++) { + handleList[i] = objectList.get(i).nativeHandle_; + } + return handleList; + } + /** * Construct the Native C++ object which will callback * to our object methods diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index 471a9bbd9e6..bec702faf37 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -38,6 +38,8 @@ private enum LibraryState { RocksDB.loadLibrary(); } + private List ownedColumnFamilyHandles = new ArrayList<>(); + /** * Loads the necessary library files. 
* Calling this method twice will have no effect. @@ -307,9 +309,12 @@ public static RocksDB open(final DBOptions options, final String path, db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + return db; } @@ -484,9 +489,12 @@ public static RocksDB openReadOnly(final DBOptions options, final String path, db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + return db; } @@ -577,9 +585,12 @@ public static RocksDB openAsSecondary(final DBOptions options, final String path db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(db, handles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + db.ownedColumnFamilyHandles.addAll(columnFamilyHandles); + return db; } @@ -597,6 +608,11 @@ public static RocksDB openAsSecondary(final DBOptions options, final String path * @throws RocksDBException if an error occurs whilst closing. */ public void closeE() throws RocksDBException { + for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) { + columnFamilyHandle.close(); + } + ownedColumnFamilyHandles.clear(); + if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); @@ -619,6 +635,11 @@ public void closeE() throws RocksDBException { */ @Override public void close() { + for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) { + columnFamilyHandle.close(); + } + ownedColumnFamilyHandles.clear(); + if (owningHandle_.compareAndSet(true, false)) { try { closeDatabase(nativeHandle_); @@ -661,10 +682,12 @@ public static List listColumnFamilies(final Options options, public ColumnFamilyHandle createColumnFamily( final ColumnFamilyDescriptor columnFamilyDescriptor) throws RocksDBException { - return new ColumnFamilyHandle(this, createColumnFamily(nativeHandle_, - columnFamilyDescriptor.getName(), - columnFamilyDescriptor.getName().length, - columnFamilyDescriptor.getOptions().nativeHandle_)); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, + createColumnFamily(nativeHandle_, columnFamilyDescriptor.getName(), + columnFamilyDescriptor.getName().length, + columnFamilyDescriptor.getOptions().nativeHandle_)); + ownedColumnFamilyHandles.add(columnFamilyHandle); + return columnFamilyHandle; } /** @@ -688,8 +711,10 @@ public List createColumnFamilies( final List columnFamilyHandles = new ArrayList<>(cfHandles.length); for (int i = 0; i < cfHandles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(this, cfHandles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + ownedColumnFamilyHandles.addAll(columnFamilyHandles); return columnFamilyHandles; } @@ -719,8 +744,10 @@ public List createColumnFamilies( final List columnFamilyHandles = new ArrayList<>(cfHandles.length); for (int i = 0; i < 
cfHandles.length; i++) { - columnFamilyHandles.add(new ColumnFamilyHandle(this, cfHandles[i])); + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]); + columnFamilyHandles.add(columnFamilyHandle); } + ownedColumnFamilyHandles.addAll(columnFamilyHandles); return columnFamilyHandles; } @@ -753,7 +780,22 @@ public void dropColumnFamilies( dropColumnFamilies(nativeHandle_, cfHandles); } - //TODO(AR) what about DestroyColumnFamilyHandle + /** + * Deletes native column family handle of given {@link ColumnFamilyHandle} Java object + * and removes reference from {@link RocksDB#ownedColumnFamilyHandles}. + * + * @param columnFamilyHandle column family handle object. + */ + public void destroyColumnFamilyHandle(final ColumnFamilyHandle columnFamilyHandle) { + for (int i = 0; i < ownedColumnFamilyHandles.size(); ++i) { + final ColumnFamilyHandle ownedHandle = ownedColumnFamilyHandles.get(i); + if (ownedHandle.equals(columnFamilyHandle)) { + columnFamilyHandle.close(); + ownedColumnFamilyHandles.remove(i); + return; + } + } + } /** * Set the database entry for "key" to "value". @@ -2504,7 +2546,9 @@ public List multiGetAsList(final ReadOptions opt, /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2528,7 +2572,9 @@ public boolean keyMayExist(final byte[] key, /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2557,7 +2603,9 @@ public boolean keyMayExist(final byte[] key, /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2584,7 +2632,9 @@ public boolean keyMayExist( /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2616,7 +2666,9 @@ public boolean keyMayExist( /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. 
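A sketch of the intended calling pattern (assuming db is an already-open RocksDB instance; Holder is the org.rocksdb.Holder used by the existing keyMayExist overloads):

final Holder<byte[]> valueHolder = new Holder<>();
if (db.keyMayExist("key1".getBytes(), valueHolder)) {
  // "Maybe" can be a false positive, so confirm unless the value was already
  // found in memory and placed into the holder.
  final byte[] value = valueHolder.getValue() != null
      ? valueHolder.getValue()
      : db.get("key1".getBytes());
  System.out.println(value == null ? "false positive" : new String(value));
} else {
  // A false return is definitive: the key is guaranteed to be absent.
  System.out.println("definitely absent");
}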
* * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2643,7 +2695,9 @@ public boolean keyMayExist( /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2675,7 +2729,9 @@ public boolean keyMayExist( /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2704,7 +2760,9 @@ public boolean keyMayExist( /** * If the key definitely does not exist in the database, then this method - * returns null, else it returns an instance of KeyMayExistResult + * returns false, otherwise it returns true if the key might exist. + * That is to say that this method is probabilistic and may return false + * positives, but never a true negative. * * If the caller wants to obtain value when the key * is found in memory, then {@code valueHolder} must be set. @@ -2792,8 +2850,8 @@ public RocksIterator newIterator(final ReadOptions readOptions) { } /** - *

<p>Return a heap-allocated iterator over the contents of the - * database. The result of newIterator() is initially invalid + *
<p>Return a heap-allocated iterator over the contents of a + * ColumnFamily. The result of newIterator() is initially invalid * (caller must call one of the Seek methods on the iterator * before using it).</p>
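For example (a sketch assuming db is open and cfHandle is one of its ColumnFamilyHandles):

try (final RocksIterator it = db.newIterator(cfHandle)) {
  // The iterator is invalid until one of the Seek methods is called.
  for (it.seekToFirst(); it.isValid(); it.next()) {
    System.out.println(new String(it.key()) + " -> " + new String(it.value()));
  }
}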

    * @@ -2812,8 +2870,8 @@ public RocksIterator newIterator( } /** - *

<p>Return a heap-allocated iterator over the contents of the - * database. The result of newIterator() is initially invalid + *
<p>Return a heap-allocated iterator over the contents of a + * ColumnFamily. The result of newIterator() is initially invalid * (caller must call one of the Seek methods on the iterator * before using it).</p>

    * @@ -3532,9 +3590,8 @@ public void setOptions( /* @Nullable */final ColumnFamilyHandle columnFamilyHandle, final MutableColumnFamilyOptions mutableColumnFamilyOptions) throws RocksDBException { - setOptions(nativeHandle_, columnFamilyHandle.nativeHandle_, - mutableColumnFamilyOptions.getKeys(), - mutableColumnFamilyOptions.getValues()); + setOptions(nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + mutableColumnFamilyOptions.getKeys(), mutableColumnFamilyOptions.getValues()); } /** @@ -4081,7 +4138,7 @@ public ColumnFamilyMetaData getColumnFamilyMetaData( * * @return the column family metadata */ - public ColumnFamilyMetaData GetColumnFamilyMetaData() { + public ColumnFamilyMetaData getColumnFamilyMetaData() { return getColumnFamilyMetaData(null); } @@ -4479,7 +4536,6 @@ private native void dropColumnFamily( final long handle, final long cfHandle) throws RocksDBException; private native void dropColumnFamilies(final long handle, final long[] cfHandles) throws RocksDBException; - //TODO(AR) best way to express DestroyColumnFamilyHandle? ...maybe in ColumnFamilyHandle? private native void put(final long handle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, int valueLength) throws RocksDBException; diff --git a/java/src/main/java/org/rocksdb/Status.java b/java/src/main/java/org/rocksdb/Status.java index e633940c297..033ed3ea1c0 100644 --- a/java/src/main/java/org/rocksdb/Status.java +++ b/java/src/main/java/org/rocksdb/Status.java @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.Objects; + /** * Represents the status returned by a function call in RocksDB. * @@ -135,4 +137,19 @@ public byte getValue() { return value; } } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Status status = (Status) o; + return code == status.code && subCode == status.subCode && Objects.equals(state, status.state); + } + + @Override + public int hashCode() { + return Objects.hash(code, subCode, state); + } } diff --git a/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java b/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java new file mode 100644 index 00000000000..5a383ade41d --- /dev/null +++ b/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java @@ -0,0 +1,107 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class TableFileCreationBriefInfo { + private final String dbName; + private final String columnFamilyName; + private final String filePath; + private final int jobId; + private final TableFileCreationReason reason; + + /** + * Access is private as this will only be constructed from + * C++ via JNI, either directly of via + * {@link TableFileCreationInfo#TableFileCreationInfo(long, TableProperties, Status, String, + * String, String, int, byte)}. 
+ * + * @param dbName the database name + * @param columnFamilyName the column family name + * @param filePath the path to the table file + * @param jobId the job identifier + * @param tableFileCreationReasonValue the reason for creation of the table file + */ + protected TableFileCreationBriefInfo(final String dbName, final String columnFamilyName, + final String filePath, final int jobId, final byte tableFileCreationReasonValue) { + this.dbName = dbName; + this.columnFamilyName = columnFamilyName; + this.filePath = filePath; + this.jobId = jobId; + this.reason = TableFileCreationReason.fromValue(tableFileCreationReasonValue); + } + + /** + * Get the name of the database where the file was created. + * + * @return the name of the database. + */ + public String getDbName() { + return dbName; + } + + /** + * Get the name of the column family where the file was created. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the path to the created file. + * + * @return the path. + */ + public String getFilePath() { + return filePath; + } + + /** + * Get the id of the job (which could be flush or compaction) that + * created the file. + * + * @return the id of the job. + */ + public int getJobId() { + return jobId; + } + + /** + * Get the reason for creating the table. + * + * @return the reason for creating the table. + */ + public TableFileCreationReason getReason() { + return reason; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableFileCreationBriefInfo that = (TableFileCreationBriefInfo) o; + return jobId == that.jobId && Objects.equals(dbName, that.dbName) + && Objects.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(filePath, that.filePath) && reason == that.reason; + } + + @Override + public int hashCode() { + return Objects.hash(dbName, columnFamilyName, filePath, jobId, reason); + } + + @Override + public String toString() { + return "TableFileCreationBriefInfo{" + + "dbName='" + dbName + '\'' + ", columnFamilyName='" + columnFamilyName + '\'' + + ", filePath='" + filePath + '\'' + ", jobId=" + jobId + ", reason=" + reason + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/TableFileCreationInfo.java b/java/src/main/java/org/rocksdb/TableFileCreationInfo.java new file mode 100644 index 00000000000..7742f32f19d --- /dev/null +++ b/java/src/main/java/org/rocksdb/TableFileCreationInfo.java @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class TableFileCreationInfo extends TableFileCreationBriefInfo { + private final long fileSize; + private final TableProperties tableProperties; + private final Status status; + + /** + * Access is protected as this will only be constructed from + * C++ via JNI. 
+ * + * @param fileSize the size of the table file + * @param tableProperties the properties of the table file + * @param status the status of the creation operation + * @param dbName the database name + * @param columnFamilyName the column family name + * @param filePath the path to the table file + * @param jobId the job identifier + * @param tableFileCreationReasonValue the reason for creation of the table file + */ + protected TableFileCreationInfo(final long fileSize, final TableProperties tableProperties, + final Status status, final String dbName, final String columnFamilyName, + final String filePath, final int jobId, final byte tableFileCreationReasonValue) { + super(dbName, columnFamilyName, filePath, jobId, tableFileCreationReasonValue); + this.fileSize = fileSize; + this.tableProperties = tableProperties; + this.status = status; + } + + /** + * Get the size of the file. + * + * @return the size. + */ + public long getFileSize() { + return fileSize; + } + + /** + * Get the detailed properties of the created file. + * + * @return the properties. + */ + public TableProperties getTableProperties() { + return tableProperties; + } + + /** + * Get the status indicating whether the creation was successful or not. + * + * @return the status. + */ + public Status getStatus() { + return status; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableFileCreationInfo that = (TableFileCreationInfo) o; + return fileSize == that.fileSize && Objects.equals(tableProperties, that.tableProperties) + && Objects.equals(status, that.status); + } + + @Override + public int hashCode() { + return Objects.hash(fileSize, tableProperties, status); + } + + @Override + public String toString() { + return "TableFileCreationInfo{" + + "fileSize=" + fileSize + ", tableProperties=" + tableProperties + ", status=" + status + + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/TableFileCreationReason.java b/java/src/main/java/org/rocksdb/TableFileCreationReason.java new file mode 100644 index 00000000000..d3984663dd2 --- /dev/null +++ b/java/src/main/java/org/rocksdb/TableFileCreationReason.java @@ -0,0 +1,46 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum TableFileCreationReason { + FLUSH((byte) 0x00), + COMPACTION((byte) 0x01), + RECOVERY((byte) 0x02), + MISC((byte) 0x03); + + private final byte value; + + TableFileCreationReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the TableFileCreationReason from the internal representation value. + * + * @return the table file creation reason. + * + * @throws IllegalArgumentException if the value is unknown. 
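A consumption sketch for the two creation-info classes; onTableFileCreated is an assumed callback name taken from the C++ EventListener API.

final AbstractEventListener creationListener = new AbstractEventListener() {
  @Override
  public void onTableFileCreated(final TableFileCreationInfo info) { // assumed callback name
    if (info.getStatus().getCode() == Status.Code.Ok) {
      System.out.println(info.getReason() + " created " + info.getFilePath() + " ("
          + info.getFileSize() + " bytes, job " + info.getJobId() + ")");
    }
  }
};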
+ */ + static TableFileCreationReason fromValue(final byte value) { + for (final TableFileCreationReason tableFileCreationReason : TableFileCreationReason.values()) { + if (tableFileCreationReason.value == value) { + return tableFileCreationReason; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for TableFileCreationReason: " + value); + } +} diff --git a/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java b/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java new file mode 100644 index 00000000000..8aad03ae8fa --- /dev/null +++ b/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class TableFileDeletionInfo { + private final String dbName; + private final String filePath; + private final int jobId; + private final Status status; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + TableFileDeletionInfo( + final String dbName, final String filePath, final int jobId, final Status status) { + this.dbName = dbName; + this.filePath = filePath; + this.jobId = jobId; + this.status = status; + } + + /** + * Get the name of the database where the file was deleted. + * + * @return the name of the database. + */ + public String getDbName() { + return dbName; + } + + /** + * Get the path to the deleted file. + * + * @return the path. + */ + public String getFilePath() { + return filePath; + } + + /** + * Get the id of the job which deleted the file. + * + * @return the id of the job. + */ + public int getJobId() { + return jobId; + } + + /** + * Get the status indicating whether the deletion was successful or not. + * + * @return the status + */ + public Status getStatus() { + return status; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableFileDeletionInfo that = (TableFileDeletionInfo) o; + return jobId == that.jobId && Objects.equals(dbName, that.dbName) + && Objects.equals(filePath, that.filePath) && Objects.equals(status, that.status); + } + + @Override + public int hashCode() { + return Objects.hash(dbName, filePath, jobId, status); + } + + @Override + public String toString() { + return "TableFileDeletionInfo{" + + "dbName='" + dbName + '\'' + ", filePath='" + filePath + '\'' + ", jobId=" + jobId + + ", status=" + status + '}'; + } +} diff --git a/java/src/main/java/org/rocksdb/TableProperties.java b/java/src/main/java/org/rocksdb/TableProperties.java index 8c0b7e370e2..c1baea2a4bc 100644 --- a/java/src/main/java/org/rocksdb/TableProperties.java +++ b/java/src/main/java/org/rocksdb/TableProperties.java @@ -1,7 +1,9 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
package org.rocksdb; +import java.util.Arrays; import java.util.Map; +import java.util.Objects; /** * TableProperties contains read-only properties of its associated @@ -27,6 +29,8 @@ public class TableProperties { private final long columnFamilyId; private final long creationTime; private final long oldestKeyTime; + private final long slowCompressionEstimatedDataSize; + private final long fastCompressionEstimatedDataSize; private final byte[] columnFamilyName; private final String filterPolicyName; private final String comparatorName; @@ -39,24 +43,22 @@ public class TableProperties { private final Map propertiesOffsets; /** - * Access is private as this will only be constructed from - * C++ via JNI. + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. */ - private TableProperties(final long dataSize, final long indexSize, - final long indexPartitions, final long topLevelIndexSize, - final long indexKeyIsUserKey, final long indexValueIsDeltaEncoded, - final long filterSize, final long rawKeySize, final long rawValueSize, - final long numDataBlocks, final long numEntries, final long numDeletions, - final long numMergeOperands, final long numRangeDeletions, - final long formatVersion, final long fixedKeyLen, - final long columnFamilyId, final long creationTime, - final long oldestKeyTime, final byte[] columnFamilyName, - final String filterPolicyName, final String comparatorName, + TableProperties(final long dataSize, final long indexSize, final long indexPartitions, + final long topLevelIndexSize, final long indexKeyIsUserKey, + final long indexValueIsDeltaEncoded, final long filterSize, final long rawKeySize, + final long rawValueSize, final long numDataBlocks, final long numEntries, + final long numDeletions, final long numMergeOperands, final long numRangeDeletions, + final long formatVersion, final long fixedKeyLen, final long columnFamilyId, + final long creationTime, final long oldestKeyTime, + final long slowCompressionEstimatedDataSize, final long fastCompressionEstimatedDataSize, + final byte[] columnFamilyName, final String filterPolicyName, final String comparatorName, final String mergeOperatorName, final String prefixExtractorName, final String propertyCollectorsNames, final String compressionName, final Map userCollectedProperties, - final Map readableProperties, - final Map propertiesOffsets) { + final Map readableProperties, final Map propertiesOffsets) { this.dataSize = dataSize; this.indexSize = indexSize; this.indexPartitions = indexPartitions; @@ -76,6 +78,8 @@ private TableProperties(final long dataSize, final long indexSize, this.columnFamilyId = columnFamilyId; this.creationTime = creationTime; this.oldestKeyTime = oldestKeyTime; + this.slowCompressionEstimatedDataSize = slowCompressionEstimatedDataSize; + this.fastCompressionEstimatedDataSize = fastCompressionEstimatedDataSize; this.columnFamilyName = columnFamilyName; this.filterPolicyName = filterPolicyName; this.comparatorName = comparatorName; @@ -268,6 +272,26 @@ public long getOldestKeyTime() { return oldestKeyTime; } + /** + * Get the estimated size of data blocks compressed with a relatively slower + * compression algorithm. + * + * @return 0 means unknown, otherwise the timestamp. + */ + public long getSlowCompressionEstimatedDataSize() { + return slowCompressionEstimatedDataSize; + } + + /** + * Get the estimated size of data blocks compressed with a relatively faster + * compression algorithm. + * + * @return 0 means unknown, otherwise the timestamp. 
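The new estimates can be read alongside the existing TableProperties fields; a sketch assuming db is open and using the pre-existing getPropertiesOfAllTables() accessor (not part of this diff):

final Map<String, TableProperties> perFile = db.getPropertiesOfAllTables();
for (final Map.Entry<String, TableProperties> entry : perFile.entrySet()) {
  final TableProperties tp = entry.getValue();
  System.out.println(entry.getKey() + ": " + tp.getNumEntries() + " entries, "
      + tp.getDataSize() + " data bytes, slow-compression estimate "
      + tp.getSlowCompressionEstimatedDataSize() + ", fast-compression estimate "
      + tp.getFastCompressionEstimatedDataSize());
}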
+ */ + public long getFastCompressionEstimatedDataSize() { + return fastCompressionEstimatedDataSize; + } + /** * Get the name of the column family with which this * SST file is associated. @@ -363,4 +387,49 @@ public Map getReadableProperties() { public Map getPropertiesOffsets() { return propertiesOffsets; } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TableProperties that = (TableProperties) o; + return dataSize == that.dataSize && indexSize == that.indexSize + && indexPartitions == that.indexPartitions && topLevelIndexSize == that.topLevelIndexSize + && indexKeyIsUserKey == that.indexKeyIsUserKey + && indexValueIsDeltaEncoded == that.indexValueIsDeltaEncoded + && filterSize == that.filterSize && rawKeySize == that.rawKeySize + && rawValueSize == that.rawValueSize && numDataBlocks == that.numDataBlocks + && numEntries == that.numEntries && numDeletions == that.numDeletions + && numMergeOperands == that.numMergeOperands && numRangeDeletions == that.numRangeDeletions + && formatVersion == that.formatVersion && fixedKeyLen == that.fixedKeyLen + && columnFamilyId == that.columnFamilyId && creationTime == that.creationTime + && oldestKeyTime == that.oldestKeyTime + && slowCompressionEstimatedDataSize == that.slowCompressionEstimatedDataSize + && fastCompressionEstimatedDataSize == that.fastCompressionEstimatedDataSize + && Arrays.equals(columnFamilyName, that.columnFamilyName) + && Objects.equals(filterPolicyName, that.filterPolicyName) + && Objects.equals(comparatorName, that.comparatorName) + && Objects.equals(mergeOperatorName, that.mergeOperatorName) + && Objects.equals(prefixExtractorName, that.prefixExtractorName) + && Objects.equals(propertyCollectorsNames, that.propertyCollectorsNames) + && Objects.equals(compressionName, that.compressionName) + && Objects.equals(userCollectedProperties, that.userCollectedProperties) + && Objects.equals(readableProperties, that.readableProperties) + && Objects.equals(propertiesOffsets, that.propertiesOffsets); + } + + @Override + public int hashCode() { + int result = Objects.hash(dataSize, indexSize, indexPartitions, topLevelIndexSize, + indexKeyIsUserKey, indexValueIsDeltaEncoded, filterSize, rawKeySize, rawValueSize, + numDataBlocks, numEntries, numDeletions, numMergeOperands, numRangeDeletions, formatVersion, + fixedKeyLen, columnFamilyId, creationTime, oldestKeyTime, slowCompressionEstimatedDataSize, + fastCompressionEstimatedDataSize, filterPolicyName, comparatorName, mergeOperatorName, + prefixExtractorName, propertyCollectorsNames, compressionName, userCollectedProperties, + readableProperties, propertiesOffsets); + result = 31 * result + Arrays.hashCode(columnFamilyName); + return result; + } } diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 7a37f35b9e1..0d6cc5a92eb 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -742,6 +742,28 @@ public enum TickerType { COMPACT_WRITE_BYTES_PERIODIC((byte) -0x14), COMPACT_WRITE_BYTES_TTL((byte) -0x15), + /** + * DB error handler statistics + */ + ERROR_HANDLER_BG_ERROR_COUNT((byte) -0x16), + ERROR_HANDLER_BG_IO_ERROR_COUNT((byte) -0x17), + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT((byte) -0x18), + ERROR_HANDLER_AUTORESUME_COUNT((byte) -0x19), + ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT((byte) -0x1A), + ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT((byte) -0x1B), + + /** + * Bytes 
of raw data (payload) found on memtable at flush time. + * Contains the sum of garbage payload (bytes that are discarded + * at flush time) and useful payload (bytes of data that will + * eventually be written to SSTable). + */ + MEMTABLE_PAYLOAD_BYTES_AT_FLUSH((byte) -0x1C), + /** + * Outdated bytes of data present on memtable at flush time. + */ + MEMTABLE_GARBAGE_BYTES_AT_FLUSH((byte) -0x1D), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff --git a/java/src/main/java/org/rocksdb/Transaction.java b/java/src/main/java/org/rocksdb/Transaction.java index f176701fa01..768329a675b 100644 --- a/java/src/main/java/org/rocksdb/Transaction.java +++ b/java/src/main/java/org/rocksdb/Transaction.java @@ -611,9 +611,9 @@ public RocksIterator getIterator(final ReadOptions readOptions) { } /** - * Returns an iterator that will iterate on all keys in the default - * column family including both keys in the DB and uncommitted keys in this - * transaction. + * Returns an iterator that will iterate on all keys in the column family + * specified by {@code columnFamilyHandle} including both keys in the DB + * and uncommitted keys in this transaction. * * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read * from the DB but will NOT change which keys are read from this transaction diff --git a/java/src/main/java/org/rocksdb/TtlDB.java b/java/src/main/java/org/rocksdb/TtlDB.java index c1e3bb473a9..a7adaf4b222 100644 --- a/java/src/main/java/org/rocksdb/TtlDB.java +++ b/java/src/main/java/org/rocksdb/TtlDB.java @@ -113,7 +113,7 @@ public static TtlDB open(final DBOptions options, final String db_path, throws RocksDBException { if (columnFamilyDescriptors.size() != ttlValues.size()) { throw new IllegalArgumentException("There must be a ttl value per column" - + "family handle."); + + " family handle."); } final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; diff --git a/java/src/main/java/org/rocksdb/WriteStallCondition.java b/java/src/main/java/org/rocksdb/WriteStallCondition.java new file mode 100644 index 00000000000..3bc9d410431 --- /dev/null +++ b/java/src/main/java/org/rocksdb/WriteStallCondition.java @@ -0,0 +1,44 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum WriteStallCondition { + NORMAL((byte) 0x0), + DELAYED((byte) 0x1), + STOPPED((byte) 0x2); + + private final byte value; + + WriteStallCondition(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation + */ + byte getValue() { + return value; + } + + /** + * Get the WriteStallCondition from the internal representation value. + * + * @return the flush reason. + * + * @throws IllegalArgumentException if the value is unknown. 
+ */ + static WriteStallCondition fromValue(final byte value) { + for (final WriteStallCondition writeStallCondition : WriteStallCondition.values()) { + if (writeStallCondition.value == value) { + return writeStallCondition; + } + } + + throw new IllegalArgumentException("Illegal value provided for WriteStallCondition: " + value); + } +} diff --git a/java/src/main/java/org/rocksdb/WriteStallInfo.java b/java/src/main/java/org/rocksdb/WriteStallInfo.java new file mode 100644 index 00000000000..4aef0eda9ad --- /dev/null +++ b/java/src/main/java/org/rocksdb/WriteStallInfo.java @@ -0,0 +1,75 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Objects; + +public class WriteStallInfo { + private final String columnFamilyName; + private final WriteStallCondition currentCondition; + private final WriteStallCondition previousCondition; + + /** + * Access is package private as this will only be constructed from + * C++ via JNI and for testing. + */ + WriteStallInfo(final String columnFamilyName, final byte currentConditionValue, + final byte previousConditionValue) { + this.columnFamilyName = columnFamilyName; + this.currentCondition = WriteStallCondition.fromValue(currentConditionValue); + this.previousCondition = WriteStallCondition.fromValue(previousConditionValue); + } + + /** + * Get the name of the column family. + * + * @return the name of the column family. + */ + public String getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the current state of the write controller. + * + * @return the current state. + */ + public WriteStallCondition getCurrentCondition() { + return currentCondition; + } + + /** + * Get the previous state of the write controller. + * + * @return the previous state. 
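A sketch of how WriteStallInfo is typically observed; onStallConditionsChanged(WriteStallInfo) is an assumed callback name taken from the C++ EventListener API.

final AbstractEventListener stallListener = new AbstractEventListener() {
  @Override
  public void onStallConditionsChanged(final WriteStallInfo info) { // assumed callback name
    if (info.getCurrentCondition() == WriteStallCondition.STOPPED
        && info.getPreviousCondition() != WriteStallCondition.STOPPED) {
      System.err.println("writes stopped for " + info.getColumnFamilyName());
    }
  }
};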
+ */ + public WriteStallCondition getPreviousCondition() { + return previousCondition; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + WriteStallInfo that = (WriteStallInfo) o; + return Objects.equals(columnFamilyName, that.columnFamilyName) + && currentCondition == that.currentCondition && previousCondition == that.previousCondition; + } + + @Override + public int hashCode() { + return Objects.hash(columnFamilyName, currentCondition, previousCondition); + } + + @Override + public String toString() { + return "WriteStallInfo{" + + "columnFamilyName='" + columnFamilyName + '\'' + ", currentCondition=" + currentCondition + + ", previousCondition=" + previousCondition + '}'; + } +} diff --git a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java index a9a08763523..9fab479b272 100644 --- a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java +++ b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java @@ -5,16 +5,17 @@ package org.rocksdb; -import java.util.*; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import java.util.*; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.assertj.core.api.Assertions.assertThat; - public class ColumnFamilyTest { @ClassRule @@ -141,33 +142,19 @@ public void openWithColumnFamilies() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { - - try { - assertThat(columnFamilyHandleList.size()).isEqualTo(2); - db.put("dfkey1".getBytes(), "dfvalue".getBytes()); - db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), - "dfvalue".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), - "newcfvalue".getBytes()); - - String retVal = new String(db.get(columnFamilyHandleList.get(1), - "newcfkey1".getBytes())); - assertThat(retVal).isEqualTo("newcfvalue"); - assertThat((db.get(columnFamilyHandleList.get(1), - "dfkey1".getBytes()))).isNull(); - db.delete(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); - assertThat((db.get(columnFamilyHandleList.get(1), - "newcfkey1".getBytes()))).isNull(); - db.delete(columnFamilyHandleList.get(0), new WriteOptions(), - "dfkey2".getBytes()); - assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), - "dfkey2".getBytes())).isNull(); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + assertThat(columnFamilyHandleList.size()).isEqualTo(2); + db.put("dfkey1".getBytes(), "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), "newcfvalue".getBytes()); + + String retVal = new String(db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes())); + assertThat(retVal).isEqualTo("newcfvalue"); + assertThat((db.get(columnFamilyHandleList.get(1), "dfkey1".getBytes()))).isNull(); + db.delete(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); + assertThat((db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes()))).isNull(); + db.delete(columnFamilyHandleList.get(0), new 
WriteOptions(), "dfkey2".getBytes()); + assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), "dfkey2".getBytes())) + .isNull(); } } @@ -184,30 +171,22 @@ public void getWithOutValueAndCf() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.put(columnFamilyHandleList.get(0), new WriteOptions(), - "key1".getBytes(), "value".getBytes()); - db.put("key2".getBytes(), "12345678".getBytes()); - final byte[] outValue = new byte[5]; - // not found value - int getResult = db.get("keyNotFound".getBytes(), outValue); - assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); - // found value which fits in outValue - getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), - outValue); - assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); - assertThat(outValue).isEqualTo("value".getBytes()); - // found value which fits partially - getResult = db.get(columnFamilyHandleList.get(0), new ReadOptions(), - "key2".getBytes(), outValue); - assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); - assertThat(outValue).isEqualTo("12345".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.put( + columnFamilyHandleList.get(0), new WriteOptions(), "key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + final byte[] outValue = new byte[5]; + // not found value + int getResult = db.get("keyNotFound".getBytes(), outValue); + assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); + // found value which fits in outValue + getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("value".getBytes()); + // found value which fits partially + getResult = + db.get(columnFamilyHandleList.get(0), new ReadOptions(), "key2".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("12345".getBytes()); } } @@ -223,22 +202,12 @@ public void createWriteDropColumnFamily() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - ColumnFamilyHandle tmpColumnFamilyHandle = null; - try { - tmpColumnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("tmpCF".getBytes(), - new ColumnFamilyOptions())); - db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); - db.dropColumnFamily(tmpColumnFamilyHandle); - assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); - } finally { - if (tmpColumnFamilyHandle != null) { - tmpColumnFamilyHandle.close(); - } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + ColumnFamilyHandle tmpColumnFamilyHandle; + tmpColumnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + db.dropColumnFamily(tmpColumnFamilyHandle); + assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); } } @@ -256,29 +225,15 @@ public void createWriteDropColumnFamilies() throws RocksDBException { columnFamilyHandleList)) { ColumnFamilyHandle tmpColumnFamilyHandle = null; ColumnFamilyHandle tmpColumnFamilyHandle2 = null; - try { - tmpColumnFamilyHandle = db.createColumnFamily( - new 
ColumnFamilyDescriptor("tmpCF".getBytes(), - new ColumnFamilyOptions())); - tmpColumnFamilyHandle2 = db.createColumnFamily( - new ColumnFamilyDescriptor("tmpCF2".getBytes(), - new ColumnFamilyOptions())); - db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); - db.put(tmpColumnFamilyHandle2, "key".getBytes(), "value".getBytes()); - db.dropColumnFamilies(Arrays.asList(tmpColumnFamilyHandle, tmpColumnFamilyHandle2)); - assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); - assertThat(tmpColumnFamilyHandle2.isOwningHandle()).isTrue(); - } finally { - if (tmpColumnFamilyHandle != null) { - tmpColumnFamilyHandle.close(); - } - if (tmpColumnFamilyHandle2 != null) { - tmpColumnFamilyHandle2.close(); - } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + tmpColumnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); + tmpColumnFamilyHandle2 = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF2".getBytes(), new ColumnFamilyOptions())); + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + db.put(tmpColumnFamilyHandle2, "key".getBytes(), "value".getBytes()); + db.dropColumnFamilies(Arrays.asList(tmpColumnFamilyHandle, tmpColumnFamilyHandle2)); + assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); + assertThat(tmpColumnFamilyHandle2.isOwningHandle()).isTrue(); } } @@ -300,36 +255,24 @@ public void writeBatch() throws RocksDBException { cfDescriptors, columnFamilyHandleList); final WriteBatch writeBatch = new WriteBatch(); final WriteOptions writeOpt = new WriteOptions()) { - try { - writeBatch.put("key".getBytes(), "value".getBytes()); - writeBatch.put(db.getDefaultColumnFamily(), - "mergeKey".getBytes(), "merge".getBytes()); - writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), - "merge".getBytes()); - writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), - "value2".getBytes()); - writeBatch.delete("xyz".getBytes()); - writeBatch.delete(columnFamilyHandleList.get(1), "xyz".getBytes()); - db.write(writeOpt, writeBatch); - - assertThat(db.get(columnFamilyHandleList.get(1), - "xyz".getBytes()) == null); - assertThat(new String(db.get(columnFamilyHandleList.get(1), - "newcfkey".getBytes()))).isEqualTo("value"); - assertThat(new String(db.get(columnFamilyHandleList.get(1), - "newcfkey2".getBytes()))).isEqualTo("value2"); - assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); - // check if key is merged - assertThat(new String(db.get(db.getDefaultColumnFamily(), - "mergeKey".getBytes()))).isEqualTo("merge,merge"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + writeBatch.put("key".getBytes(), "value".getBytes()); + writeBatch.put(db.getDefaultColumnFamily(), "mergeKey".getBytes(), "merge".getBytes()); + writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), "merge".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), "value2".getBytes()); + writeBatch.delete("xyz".getBytes()); + writeBatch.delete(columnFamilyHandleList.get(1), "xyz".getBytes()); + db.write(writeOpt, writeBatch); + + assertThat(db.get(columnFamilyHandleList.get(1), "xyz".getBytes()) == null); + 
assertThat(new String(db.get(columnFamilyHandleList.get(1), "newcfkey".getBytes()))) + .isEqualTo("value"); + assertThat(new String(db.get(columnFamilyHandleList.get(1), "newcfkey2".getBytes()))) + .isEqualTo("value2"); + assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); + // check if key is merged + assertThat(new String(db.get(db.getDefaultColumnFamily(), "mergeKey".getBytes()))) + .isEqualTo("merge,merge"); } } } @@ -346,32 +289,21 @@ public void iteratorOnColumnFamily() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), - "value2".getBytes()); - try (final RocksIterator rocksIterator = - db.newIterator(columnFamilyHandleList.get(1))) { - rocksIterator.seekToFirst(); - Map refMap = new HashMap<>(); - refMap.put("newcfkey", "value"); - refMap.put("newcfkey2", "value2"); - int i = 0; - while (rocksIterator.isValid()) { - i++; - assertThat(refMap.get(new String(rocksIterator.key()))). - isEqualTo(new String(rocksIterator.value())); - rocksIterator.next(); - } - assertThat(i).isEqualTo(2); - } - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), "value2".getBytes()); + try (final RocksIterator rocksIterator = db.newIterator(columnFamilyHandleList.get(1))) { + rocksIterator.seekToFirst(); + Map refMap = new HashMap<>(); + refMap.put("newcfkey", "value"); + refMap.put("newcfkey2", "value2"); + int i = 0; + while (rocksIterator.isValid()) { + i++; + assertThat(refMap.get(new String(rocksIterator.key()))) + .isEqualTo(new String(rocksIterator.value())); + rocksIterator.next(); } + assertThat(i).isEqualTo(2); } } } @@ -388,35 +320,20 @@ public void multiGet() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.put(columnFamilyHandleList.get(0), "key".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - - final List keys = Arrays.asList(new byte[][]{ - "key".getBytes(), "newcfkey".getBytes() - }); - - List retValues = db.multiGetAsList(columnFamilyHandleList, keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, - keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + + final List keys = + Arrays.asList(new byte[][] {"key".getBytes(), "newcfkey".getBytes()}); + + List retValues = db.multiGetAsList(columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new 
String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); + retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); } } @@ -432,35 +349,19 @@ public void multiGetAsList() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.put(columnFamilyHandleList.get(0), "key".getBytes(), - "value".getBytes()); - db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), - "value".getBytes()); - - final List keys = Arrays.asList(new byte[][]{ - "key".getBytes(), "newcfkey".getBytes() - }); - List retValues = db.multiGetAsList(columnFamilyHandleList, - keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, - keys); - assertThat(retValues.size()).isEqualTo(2); - assertThat(new String(retValues.get(0))) - .isEqualTo("value"); - assertThat(new String(retValues.get(1))) - .isEqualTo("value"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + + final List keys = + Arrays.asList(new byte[][] {"key".getBytes(), "newcfkey".getBytes()}); + List retValues = db.multiGetAsList(columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); + retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))).isEqualTo("value"); + assertThat(new String(retValues.get(1))).isEqualTo("value"); } } @@ -476,30 +377,18 @@ public void properties() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - assertThat(db.getProperty("rocksdb.estimate-num-keys")). - isNotNull(); - assertThat(db.getLongProperty(columnFamilyHandleList.get(0), - "rocksdb.estimate-num-keys")).isGreaterThanOrEqualTo(0); - assertThat(db.getProperty("rocksdb.stats")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(0), - "rocksdb.sstables")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.estimate-num-keys")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.stats")).isNotNull(); - assertThat(db.getProperty(columnFamilyHandleList.get(1), - "rocksdb.sstables")).isNotNull(); - assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). - isNotNull(); - assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). 
- isGreaterThanOrEqualTo(0); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + assertThat(db.getProperty("rocksdb.estimate-num-keys")).isNotNull(); + assertThat(db.getLongProperty(columnFamilyHandleList.get(0), "rocksdb.estimate-num-keys")) + .isGreaterThanOrEqualTo(0); + assertThat(db.getProperty("rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(0), "rocksdb.sstables")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.estimate-num-keys")) + .isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.sstables")).isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")).isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")) + .isGreaterThanOrEqualTo(0); } } @@ -547,10 +436,6 @@ public void iterators() throws RocksDBException { rocksIterator.close(); } } - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } } } } @@ -566,15 +451,8 @@ public void failPutDisposedCF() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.put(columnFamilyHandleList.get(1), "key".getBytes(), - "value".getBytes()); - } finally { - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.put(columnFamilyHandleList.get(1), "key".getBytes(), "value".getBytes()); } } @@ -589,15 +467,8 @@ public void failRemoveDisposedCF() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.delete(columnFamilyHandleList.get(1), "key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.delete(columnFamilyHandleList.get(1), "key".getBytes()); } } @@ -612,15 +483,8 @@ public void failGetDisposedCF() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - db.dropColumnFamily(columnFamilyHandleList.get(1)); - db.get(columnFamilyHandleList.get(1), "key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.get(columnFamilyHandleList.get(1), "key".getBytes()); } } @@ -635,19 +499,11 @@ public void failMultiGetWithoutCorrectNumberOfCF() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - try { - final List keys = new ArrayList<>(); - keys.add("key".getBytes()); - keys.add("newcfkey".getBytes()); - final List cfCustomList = new ArrayList<>(); - db.multiGetAsList(cfCustomList, keys); - - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + final List keys = new 
ArrayList<>(); + keys.add("key".getBytes()); + keys.add("newcfkey".getBytes()); + final List cfCustomList = new ArrayList<>(); + db.multiGetAsList(cfCustomList, keys); } } @@ -661,25 +517,12 @@ public void testByteCreateFolumnFamily() throws RocksDBException { final byte[] b0 = new byte[]{(byte) 0x00}; final byte[] b1 = new byte[]{(byte) 0x01}; final byte[] b2 = new byte[]{(byte) 0x02}; - ColumnFamilyHandle cf1 = null, cf2 = null, cf3 = null; - try { - cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); - cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); - final List families = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), b0, b1); - cf3 = db.createColumnFamily(new ColumnFamilyDescriptor(b2)); - } finally { - if (cf1 != null) { - cf1.close(); - } - if (cf2 != null) { - cf2.close(); - } - if (cf3 != null) { - cf3.close(); - } - } + db.createColumnFamily(new ColumnFamilyDescriptor(b0)); + db.createColumnFamily(new ColumnFamilyDescriptor(b1)); + final List families = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), b0, b1); + db.createColumnFamily(new ColumnFamilyDescriptor(b2)); } } @@ -690,22 +533,13 @@ public void testCFNamesWithZeroBytes() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); ) { - try { - final byte[] b0 = new byte[]{0, 0}; - final byte[] b1 = new byte[]{0, 1}; - cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); - cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); - final List families = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), b0, b1); - } finally { - if (cf1 != null) { - cf1.close(); - } - if (cf2 != null) { - cf2.close(); - } - } + final byte[] b0 = new byte[] {0, 0}; + final byte[] b1 = new byte[] {0, 1}; + cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); + cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); + final List families = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), b0, b1); } } @@ -716,17 +550,57 @@ public void testCFNameSimplifiedChinese() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); ) { + final String simplifiedChinese = "\u7b80\u4f53\u5b57"; + columnFamilyHandle = + db.createColumnFamily(new ColumnFamilyDescriptor(simplifiedChinese.getBytes())); + + final List families = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(families).contains("default".getBytes(), simplifiedChinese.getBytes()); + } + } + + @Test + public void testDestroyColumnFamilyHandle() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());) { + final byte[] name1 = "cf1".getBytes(); + final byte[] name2 = "cf2".getBytes(); + final ColumnFamilyDescriptor desc1 = new ColumnFamilyDescriptor(name1); + final ColumnFamilyDescriptor desc2 = new ColumnFamilyDescriptor(name2); + final ColumnFamilyHandle cf1 = db.createColumnFamily(desc1); + final ColumnFamilyHandle cf2 = db.createColumnFamily(desc2); + assertTrue(cf1.isOwningHandle()); + assertTrue(cf2.isOwningHandle()); + assertFalse(cf1.isDefaultColumnFamily()); + 
db.destroyColumnFamilyHandle(cf1); + // At this point cf1 should not be used! + assertFalse(cf1.isOwningHandle()); + assertTrue(cf2.isOwningHandle()); + } + } + + @Test + @Deprecated + /** + * @deprecated Now explicitly closing instances of ColumnFamilyHandle is not required. + * RocksDB instance will take care of closing its associated ColumnFamilyHandle objects. + */ + public void testColumnFamilyCloseBeforeDb() throws RocksDBException { + final List cfNames = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List columnFamilyHandleList = new ArrayList<>(); + + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfNames, columnFamilyHandleList)) { try { - final String simplifiedChinese = "\u7b80\u4f53\u5b57"; - columnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor(simplifiedChinese.getBytes())); - - final List families = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); - assertThat(families).contains("default".getBytes(), - simplifiedChinese.getBytes()); + db.put("testKey".getBytes(), "tstValue".getBytes()); + // Do something... } finally { - if (columnFamilyHandle != null) { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { columnFamilyHandle.close(); } } diff --git a/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java b/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java index e05f1eef3a7..35a14eb5490 100644 --- a/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java +++ b/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java @@ -39,29 +39,22 @@ public void columnFamilyOptions_setCompactionFilterFactory() final List cfHandles = new ArrayList<>(); - try (final RocksDB rocksDb = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath(), cfNames, cfHandles); - ) { - try { - final byte[] key1 = "key1".getBytes(); - final byte[] key2 = "key2".getBytes(); + try (final RocksDB rocksDb = + RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfNames, cfHandles)) { + final byte[] key1 = "key1".getBytes(); + final byte[] key2 = "key2".getBytes(); - final byte[] value1 = "value1".getBytes(); - final byte[] value2 = new byte[0]; + final byte[] value1 = "value1".getBytes(); + final byte[] value2 = new byte[0]; - rocksDb.put(cfHandles.get(1), key1, value1); - rocksDb.put(cfHandles.get(1), key2, value2); + rocksDb.put(cfHandles.get(1), key1, value1); + rocksDb.put(cfHandles.get(1), key2, value2); - rocksDb.compactRange(cfHandles.get(1)); + rocksDb.compactRange(cfHandles.get(1)); - assertThat(rocksDb.get(cfHandles.get(1), key1)).isEqualTo(value1); - final boolean exists = rocksDb.keyMayExist(cfHandles.get(1), key2, null); - assertThat(exists).isFalse(); - } finally { - for (final ColumnFamilyHandle cfHandle : cfHandles) { - cfHandle.close(); - } - } + assertThat(rocksDb.get(cfHandles.get(1), key1)).isEqualTo(value1); + final boolean exists = rocksDb.keyMayExist(cfHandles.get(1), key2, null); + assertThat(exists).isFalse(); } } } diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java index 32e5aa21ce5..17964aaef1e 100644 --- a/java/src/test/java/org/rocksdb/DBOptionsTest.java +++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -5,13 +5,16 @@ package org.rocksdb; -import 
org.junit.ClassRule; -import org.junit.Test; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; import java.nio.file.Paths; import java.util.*; - -import static org.assertj.core.api.Assertions.assertThat; +import java.util.concurrent.atomic.AtomicBoolean; +import org.junit.ClassRule; +import org.junit.Test; public class DBOptionsTest { @@ -895,4 +898,38 @@ public void skipCheckingSstFileSizesOnDbOpen() { assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true); } } + + @Test + public void eventListeners() { + final AtomicBoolean wasCalled1 = new AtomicBoolean(); + final AtomicBoolean wasCalled2 = new AtomicBoolean(); + try (final DBOptions options = new DBOptions(); + final AbstractEventListener el1 = + new AbstractEventListener() { + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + wasCalled1.set(true); + } + }; + final AbstractEventListener el2 = + new AbstractEventListener() { + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + wasCalled2.set(true); + } + }) { + assertThat(options.setListeners(Arrays.asList(el1, el2))).isEqualTo(options); + List listeners = options.listeners(); + assertEquals(el1, listeners.get(0)); + assertEquals(el2, listeners.get(1)); + options.setListeners(Collections.emptyList()); + listeners.get(0).onTableFileDeleted(null); + assertTrue(wasCalled1.get()); + listeners.get(1).onMemTableSealed(null); + assertTrue(wasCalled2.get()); + List listeners2 = options.listeners(); + assertNotNull(listeners2); + assertEquals(0, listeners2.size()); + } + } } diff --git a/java/src/test/java/org/rocksdb/EventListenerTest.java b/java/src/test/java/org/rocksdb/EventListenerTest.java new file mode 100644 index 00000000000..61193ff6765 --- /dev/null +++ b/java/src/test/java/org/rocksdb/EventListenerTest.java @@ -0,0 +1,765 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.*; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; +import java.util.concurrent.atomic.AtomicBoolean; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.AbstractEventListener.EnabledEventCallback; +import org.rocksdb.test.TestableEventListener; + +public class EventListenerTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + public static final Random rand = PlatformRandomHelper.getPlatformSpecificRandomFactory(); + + void flushDb(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + db.flush(new FlushOptions()); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onFlushCompleted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onFlushCompletedListener = new AbstractEventListener() { + @Override + public void onFlushCompleted(final RocksDB rocksDb, final FlushJobInfo flushJobInfo) { + assertNotNull(flushJobInfo.getColumnFamilyName()); + assertEquals(FlushReason.MANUAL_FLUSH, flushJobInfo.getFlushReason()); + wasCbCalled.set(true); + } + }; + flushDb(onFlushCompletedListener, wasCbCalled); + } + + @Test + public void onFlushBegin() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onFlushBeginListener = new AbstractEventListener() { + @Override + public void onFlushBegin(final RocksDB rocksDb, final FlushJobInfo flushJobInfo) { + assertNotNull(flushJobInfo.getColumnFamilyName()); + assertEquals(FlushReason.MANUAL_FLUSH, flushJobInfo.getFlushReason()); + wasCbCalled.set(true); + } + }; + flushDb(onFlushBeginListener, wasCbCalled); + } + + void deleteTableFile(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + final RocksDB.LiveFiles liveFiles = db.getLiveFiles(); + assertNotNull(liveFiles); + assertNotNull(liveFiles.files); + assertFalse(liveFiles.files.isEmpty()); + db.deleteFile(liveFiles.files.get(0)); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onTableFileDeleted() throws RocksDBException, InterruptedException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onTableFileDeletedListener = new AbstractEventListener() { + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + assertNotNull(tableFileDeletionInfo.getDbName()); + wasCbCalled.set(true); + } + }; + deleteTableFile(onTableFileDeletedListener, wasCbCalled); + } + + void compactRange(final 
AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + db.compactRange(); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onCompactionBegin() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onCompactionBeginListener = new AbstractEventListener() { + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + assertEquals(CompactionReason.kManualCompaction, compactionJobInfo.compactionReason()); + wasCbCalled.set(true); + } + }; + compactRange(onCompactionBeginListener, wasCbCalled); + } + + @Test + public void onCompactionCompleted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onCompactionCompletedListener = new AbstractEventListener() { + @Override + public void onCompactionCompleted( + final RocksDB db, final CompactionJobInfo compactionJobInfo) { + assertEquals(CompactionReason.kManualCompaction, compactionJobInfo.compactionReason()); + wasCbCalled.set(true); + } + }; + compactRange(onCompactionCompletedListener, wasCbCalled); + } + + @Test + public void onTableFileCreated() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onTableFileCreatedListener = new AbstractEventListener() { + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + assertEquals(TableFileCreationReason.FLUSH, tableFileCreationInfo.getReason()); + wasCbCalled.set(true); + } + }; + flushDb(onTableFileCreatedListener, wasCbCalled); + } + + @Test + public void onTableFileCreationStarted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onTableFileCreationStartedListener = new AbstractEventListener() { + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + assertEquals(TableFileCreationReason.FLUSH, tableFileCreationBriefInfo.getReason()); + wasCbCalled.set(true); + } + }; + flushDb(onTableFileCreationStartedListener, wasCbCalled); + } + + void deleteColumnFamilyHandle(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final byte[] value = new byte[24]; + rand.nextBytes(value); + db.put("testKey".getBytes(), value); + ColumnFamilyHandle columnFamilyHandle = db.getDefaultColumnFamily(); + columnFamilyHandle.close(); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onColumnFamilyHandleDeletionStarted() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onColumnFamilyHandleDeletionStartedListener = + new AbstractEventListener() { + @Override + public void onColumnFamilyHandleDeletionStarted( + final ColumnFamilyHandle columnFamilyHandle) { + assertNotNull(columnFamilyHandle); + wasCbCalled.set(true); + } + 
}; + deleteColumnFamilyHandle(onColumnFamilyHandleDeletionStartedListener, wasCbCalled); + } + + void ingestExternalFile(final AbstractEventListener el, final AtomicBoolean wasCbCalled) + throws RocksDBException { + try (final Options opt = + new Options().setCreateIfMissing(true).setListeners(Collections.singletonList(el)); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + assertThat(db).isNotNull(); + final String uuid = UUID.randomUUID().toString(); + final SstFileWriter sstFileWriter = new SstFileWriter(new EnvOptions(), opt); + final Path externalFilePath = Paths.get(db.getName(), uuid); + sstFileWriter.open(externalFilePath.toString()); + sstFileWriter.put("testKey".getBytes(), uuid.getBytes()); + sstFileWriter.finish(); + db.ingestExternalFile( + Collections.singletonList(externalFilePath.toString()), new IngestExternalFileOptions()); + assertTrue(wasCbCalled.get()); + } + } + + @Test + public void onExternalFileIngested() throws RocksDBException { + final AtomicBoolean wasCbCalled = new AtomicBoolean(); + final AbstractEventListener onExternalFileIngestedListener = new AbstractEventListener() { + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + assertNotNull(db); + wasCbCalled.set(true); + } + }; + ingestExternalFile(onExternalFileIngestedListener, wasCbCalled); + } + + @Test + public void testAllCallbacksInvocation() { + final int TEST_INT_VAL = -1; + final long TEST_LONG_VAL = -1; + // Expected test data objects + final Map userCollectedPropertiesTestData = + Collections.singletonMap("key", "value"); + final Map readablePropertiesTestData = Collections.singletonMap("key", "value"); + final Map propertiesOffsetsTestData = + Collections.singletonMap("key", TEST_LONG_VAL); + final TableProperties tablePropertiesTestData = new TableProperties(TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, + TEST_LONG_VAL, TEST_LONG_VAL, "columnFamilyName".getBytes(), "filterPolicyName", + "comparatorName", "mergeOperatorName", "prefixExtractorName", "propertyCollectorsNames", + "compressionName", userCollectedPropertiesTestData, readablePropertiesTestData, + propertiesOffsetsTestData); + final FlushJobInfo flushJobInfoTestData = new FlushJobInfo(Integer.MAX_VALUE, + "testColumnFamily", "/file/path", TEST_LONG_VAL, Integer.MAX_VALUE, true, true, + TEST_LONG_VAL, TEST_LONG_VAL, tablePropertiesTestData, (byte) 0x0a); + final Status statusTestData = new Status(Status.Code.Incomplete, Status.SubCode.NoSpace, null); + final TableFileDeletionInfo tableFileDeletionInfoTestData = + new TableFileDeletionInfo("dbName", "/file/path", Integer.MAX_VALUE, statusTestData); + final TableFileCreationInfo tableFileCreationInfoTestData = + new TableFileCreationInfo(TEST_LONG_VAL, tablePropertiesTestData, statusTestData, "dbName", + "columnFamilyName", "/file/path", Integer.MAX_VALUE, (byte) 0x03); + final TableFileCreationBriefInfo tableFileCreationBriefInfoTestData = + new TableFileCreationBriefInfo( + "dbName", "columnFamilyName", "/file/path", Integer.MAX_VALUE, (byte) 0x03); + final MemTableInfo memTableInfoTestData = new MemTableInfo( + "columnFamilyName", TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL, TEST_LONG_VAL); + final FileOperationInfo 
fileOperationInfoTestData = new FileOperationInfo("/file/path", + TEST_LONG_VAL, TEST_LONG_VAL, 1_600_699_420_000_000_000L, 5_000_000_000L, statusTestData); + final WriteStallInfo writeStallInfoTestData = + new WriteStallInfo("columnFamilyName", (byte) 0x1, (byte) 0x2); + final ExternalFileIngestionInfo externalFileIngestionInfoTestData = + new ExternalFileIngestionInfo("columnFamilyName", "/external/file/path", + "/internal/file/path", TEST_LONG_VAL, tablePropertiesTestData); + + final CapturingTestableEventListener listener = new CapturingTestableEventListener() { + @Override + public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) { + super.onFlushCompleted(db, flushJobInfo); + assertEquals(flushJobInfoTestData, flushJobInfo); + } + + @Override + public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { + super.onFlushBegin(db, flushJobInfo); + assertEquals(flushJobInfoTestData, flushJobInfo); + } + + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + super.onTableFileDeleted(tableFileDeletionInfo); + assertEquals(tableFileDeletionInfoTestData, tableFileDeletionInfo); + } + + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + super.onCompactionBegin(db, compactionJobInfo); + assertArrayEquals( + "compactionColumnFamily".getBytes(), compactionJobInfo.columnFamilyName()); + assertEquals(statusTestData, compactionJobInfo.status()); + assertEquals(TEST_LONG_VAL, compactionJobInfo.threadId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.jobId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.baseInputLevel()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.outputLevel()); + assertEquals(Collections.singletonList("inputFile.sst"), compactionJobInfo.inputFiles()); + assertEquals(Collections.singletonList("outputFile.sst"), compactionJobInfo.outputFiles()); + assertEquals(Collections.singletonMap("tableProperties", tablePropertiesTestData), + compactionJobInfo.tableProperties()); + assertEquals(CompactionReason.kFlush, compactionJobInfo.compactionReason()); + assertEquals(CompressionType.SNAPPY_COMPRESSION, compactionJobInfo.compression()); + } + + @Override + public void onCompactionCompleted( + final RocksDB db, final CompactionJobInfo compactionJobInfo) { + super.onCompactionCompleted(db, compactionJobInfo); + assertArrayEquals( + "compactionColumnFamily".getBytes(), compactionJobInfo.columnFamilyName()); + assertEquals(statusTestData, compactionJobInfo.status()); + assertEquals(TEST_LONG_VAL, compactionJobInfo.threadId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.jobId()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.baseInputLevel()); + assertEquals(Integer.MAX_VALUE, compactionJobInfo.outputLevel()); + assertEquals(Collections.singletonList("inputFile.sst"), compactionJobInfo.inputFiles()); + assertEquals(Collections.singletonList("outputFile.sst"), compactionJobInfo.outputFiles()); + assertEquals(Collections.singletonMap("tableProperties", tablePropertiesTestData), + compactionJobInfo.tableProperties()); + assertEquals(CompactionReason.kFlush, compactionJobInfo.compactionReason()); + assertEquals(CompressionType.SNAPPY_COMPRESSION, compactionJobInfo.compression()); + } + + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + super.onTableFileCreated(tableFileCreationInfo); + assertEquals(tableFileCreationInfoTestData, tableFileCreationInfo); + } + + @Override + 
public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + super.onTableFileCreationStarted(tableFileCreationBriefInfo); + assertEquals(tableFileCreationBriefInfoTestData, tableFileCreationBriefInfo); + } + + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + super.onMemTableSealed(memTableInfo); + assertEquals(memTableInfoTestData, memTableInfo); + } + + @Override + public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) { + super.onColumnFamilyHandleDeletionStarted(columnFamilyHandle); + } + + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + super.onExternalFileIngested(db, externalFileIngestionInfo); + assertEquals(externalFileIngestionInfoTestData, externalFileIngestionInfo); + } + + @Override + public void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + super.onBackgroundError(backgroundErrorReason, backgroundError); + } + + @Override + public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) { + super.onStallConditionsChanged(writeStallInfo); + assertEquals(writeStallInfoTestData, writeStallInfo); + } + + @Override + public void onFileReadFinish(final FileOperationInfo fileOperationInfo) { + super.onFileReadFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) { + super.onFileWriteFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileFlushFinish(final FileOperationInfo fileOperationInfo) { + super.onFileFlushFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) { + super.onFileSyncFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) { + super.onFileRangeSyncFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) { + assertEquals(fileOperationInfoTestData, fileOperationInfo); + super.onFileTruncateFinish(fileOperationInfo); + } + + @Override + public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) { + super.onFileCloseFinish(fileOperationInfo); + assertEquals(fileOperationInfoTestData, fileOperationInfo); + } + + @Override + public boolean shouldBeNotifiedOnFileIO() { + super.shouldBeNotifiedOnFileIO(); + return false; + } + + @Override + public boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + super.onErrorRecoveryBegin(backgroundErrorReason, backgroundError); + assertEquals(BackgroundErrorReason.FLUSH, backgroundErrorReason); + assertEquals(statusTestData, backgroundError); + return true; + } + + @Override + public void onErrorRecoveryCompleted(final Status oldBackgroundError) { + super.onErrorRecoveryCompleted(oldBackgroundError); + assertEquals(statusTestData, oldBackgroundError); + } + }; + + // test action + listener.invokeAllCallbacks(); + + // assert + assertAllEventsCalled(listener); + } + + @Test + public void testEnabledCallbacks() { + final 
EnabledEventCallback enabledEvents[] = { + EnabledEventCallback.ON_MEMTABLE_SEALED, EnabledEventCallback.ON_ERROR_RECOVERY_COMPLETED}; + + final CapturingTestableEventListener listener = + new CapturingTestableEventListener(enabledEvents); + + // test action + listener.invokeAllCallbacks(); + + // assert + assertEventsCalled(listener, enabledEvents); + } + + private static void assertAllEventsCalled( + final CapturingTestableEventListener capturingTestableEventListener) { + assertEventsCalled(capturingTestableEventListener, EnumSet.allOf(EnabledEventCallback.class)); + } + + private static void assertEventsCalled( + final CapturingTestableEventListener capturingTestableEventListener, + final EnabledEventCallback[] expected) { + assertEventsCalled(capturingTestableEventListener, EnumSet.copyOf(Arrays.asList(expected))); + } + + private static void assertEventsCalled( + final CapturingTestableEventListener capturingTestableEventListener, + final EnumSet expected) { + final ListenerEvents capturedEvents = capturingTestableEventListener.capturedListenerEvents; + + if (expected.contains(EnabledEventCallback.ON_FLUSH_COMPLETED)) { + assertTrue("onFlushCompleted was not called", capturedEvents.flushCompleted); + } else { + assertFalse("onFlushCompleted was not called", capturedEvents.flushCompleted); + } + + if (expected.contains(EnabledEventCallback.ON_FLUSH_BEGIN)) { + assertTrue("onFlushBegin was not called", capturedEvents.flushBegin); + } else { + assertFalse("onFlushBegin was called", capturedEvents.flushBegin); + } + + if (expected.contains(EnabledEventCallback.ON_TABLE_FILE_DELETED)) { + assertTrue("onTableFileDeleted was not called", capturedEvents.tableFileDeleted); + } else { + assertFalse("onTableFileDeleted was called", capturedEvents.tableFileDeleted); + } + + if (expected.contains(EnabledEventCallback.ON_COMPACTION_BEGIN)) { + assertTrue("onCompactionBegin was not called", capturedEvents.compactionBegin); + } else { + assertFalse("onCompactionBegin was called", capturedEvents.compactionBegin); + } + + if (expected.contains(EnabledEventCallback.ON_COMPACTION_COMPLETED)) { + assertTrue("onCompactionCompleted was not called", capturedEvents.compactionCompleted); + } else { + assertFalse("onCompactionCompleted was called", capturedEvents.compactionCompleted); + } + + if (expected.contains(EnabledEventCallback.ON_TABLE_FILE_CREATED)) { + assertTrue("onTableFileCreated was not called", capturedEvents.tableFileCreated); + } else { + assertFalse("onTableFileCreated was called", capturedEvents.tableFileCreated); + } + + if (expected.contains(EnabledEventCallback.ON_TABLE_FILE_CREATION_STARTED)) { + assertTrue( + "onTableFileCreationStarted was not called", capturedEvents.tableFileCreationStarted); + } else { + assertFalse("onTableFileCreationStarted was called", capturedEvents.tableFileCreationStarted); + } + + if (expected.contains(EnabledEventCallback.ON_MEMTABLE_SEALED)) { + assertTrue("onMemTableSealed was not called", capturedEvents.memTableSealed); + } else { + assertFalse("onMemTableSealed was called", capturedEvents.memTableSealed); + } + + if (expected.contains(EnabledEventCallback.ON_COLUMN_FAMILY_HANDLE_DELETION_STARTED)) { + assertTrue("onColumnFamilyHandleDeletionStarted was not called", + capturedEvents.columnFamilyHandleDeletionStarted); + } else { + assertFalse("onColumnFamilyHandleDeletionStarted was called", + capturedEvents.columnFamilyHandleDeletionStarted); + } + + if (expected.contains(EnabledEventCallback.ON_EXTERNAL_FILE_INGESTED)) { + 
assertTrue("onExternalFileIngested was not called", capturedEvents.externalFileIngested); + } else { + assertFalse("onExternalFileIngested was called", capturedEvents.externalFileIngested); + } + + if (expected.contains(EnabledEventCallback.ON_BACKGROUND_ERROR)) { + assertTrue("onBackgroundError was not called", capturedEvents.backgroundError); + } else { + assertFalse("onBackgroundError was called", capturedEvents.backgroundError); + } + + if (expected.contains(EnabledEventCallback.ON_STALL_CONDITIONS_CHANGED)) { + assertTrue("onStallConditionsChanged was not called", capturedEvents.stallConditionsChanged); + } else { + assertFalse("onStallConditionsChanged was called", capturedEvents.stallConditionsChanged); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_READ_FINISH)) { + assertTrue("onFileReadFinish was not called", capturedEvents.fileReadFinish); + } else { + assertFalse("onFileReadFinish was called", capturedEvents.fileReadFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_WRITE_FINISH)) { + assertTrue("onFileWriteFinish was not called", capturedEvents.fileWriteFinish); + } else { + assertFalse("onFileWriteFinish was called", capturedEvents.fileWriteFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_FLUSH_FINISH)) { + assertTrue("onFileFlushFinish was not called", capturedEvents.fileFlushFinish); + } else { + assertFalse("onFileFlushFinish was called", capturedEvents.fileFlushFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_SYNC_FINISH)) { + assertTrue("onFileSyncFinish was not called", capturedEvents.fileSyncFinish); + } else { + assertFalse("onFileSyncFinish was called", capturedEvents.fileSyncFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_RANGE_SYNC_FINISH)) { + assertTrue("onFileRangeSyncFinish was not called", capturedEvents.fileRangeSyncFinish); + } else { + assertFalse("onFileRangeSyncFinish was called", capturedEvents.fileRangeSyncFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_TRUNCATE_FINISH)) { + assertTrue("onFileTruncateFinish was not called", capturedEvents.fileTruncateFinish); + } else { + assertFalse("onFileTruncateFinish was called", capturedEvents.fileTruncateFinish); + } + + if (expected.contains(EnabledEventCallback.ON_FILE_CLOSE_FINISH)) { + assertTrue("onFileCloseFinish was not called", capturedEvents.fileCloseFinish); + } else { + assertFalse("onFileCloseFinish was called", capturedEvents.fileCloseFinish); + } + + if (expected.contains(EnabledEventCallback.SHOULD_BE_NOTIFIED_ON_FILE_IO)) { + assertTrue( + "shouldBeNotifiedOnFileIO was not called", capturedEvents.shouldBeNotifiedOnFileIO); + } else { + assertFalse("shouldBeNotifiedOnFileIO was called", capturedEvents.shouldBeNotifiedOnFileIO); + } + + if (expected.contains(EnabledEventCallback.ON_ERROR_RECOVERY_BEGIN)) { + assertTrue("onErrorRecoveryBegin was not called", capturedEvents.errorRecoveryBegin); + } else { + assertFalse("onErrorRecoveryBegin was called", capturedEvents.errorRecoveryBegin); + } + + if (expected.contains(EnabledEventCallback.ON_ERROR_RECOVERY_COMPLETED)) { + assertTrue("onErrorRecoveryCompleted was not called", capturedEvents.errorRecoveryCompleted); + } else { + assertFalse("onErrorRecoveryCompleted was called", capturedEvents.errorRecoveryCompleted); + } + } + + /** + * Members are volatile as they may be written + * and read by different threads. 
+ */ + private static class ListenerEvents { + volatile boolean flushCompleted; + volatile boolean flushBegin; + volatile boolean tableFileDeleted; + volatile boolean compactionBegin; + volatile boolean compactionCompleted; + volatile boolean tableFileCreated; + volatile boolean tableFileCreationStarted; + volatile boolean memTableSealed; + volatile boolean columnFamilyHandleDeletionStarted; + volatile boolean externalFileIngested; + volatile boolean backgroundError; + volatile boolean stallConditionsChanged; + volatile boolean fileReadFinish; + volatile boolean fileWriteFinish; + volatile boolean fileFlushFinish; + volatile boolean fileSyncFinish; + volatile boolean fileRangeSyncFinish; + volatile boolean fileTruncateFinish; + volatile boolean fileCloseFinish; + volatile boolean shouldBeNotifiedOnFileIO; + volatile boolean errorRecoveryBegin; + volatile boolean errorRecoveryCompleted; + } + + private static class CapturingTestableEventListener extends TestableEventListener { + final ListenerEvents capturedListenerEvents = new ListenerEvents(); + + public CapturingTestableEventListener() {} + + public CapturingTestableEventListener(final EnabledEventCallback... enabledEventCallbacks) { + super(enabledEventCallbacks); + } + + @Override + public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) { + capturedListenerEvents.flushCompleted = true; + } + + @Override + public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { + capturedListenerEvents.flushBegin = true; + } + + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + capturedListenerEvents.tableFileDeleted = true; + } + + @Override + public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + capturedListenerEvents.compactionBegin = true; + } + + @Override + public void onCompactionCompleted(final RocksDB db, final CompactionJobInfo compactionJobInfo) { + capturedListenerEvents.compactionCompleted = true; + } + + @Override + public void onTableFileCreated(final TableFileCreationInfo tableFileCreationInfo) { + capturedListenerEvents.tableFileCreated = true; + } + + @Override + public void onTableFileCreationStarted( + final TableFileCreationBriefInfo tableFileCreationBriefInfo) { + capturedListenerEvents.tableFileCreationStarted = true; + } + + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + capturedListenerEvents.memTableSealed = true; + } + + @Override + public void onColumnFamilyHandleDeletionStarted(final ColumnFamilyHandle columnFamilyHandle) { + capturedListenerEvents.columnFamilyHandleDeletionStarted = true; + } + + @Override + public void onExternalFileIngested( + final RocksDB db, final ExternalFileIngestionInfo externalFileIngestionInfo) { + capturedListenerEvents.externalFileIngested = true; + } + + @Override + public void onBackgroundError( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + capturedListenerEvents.backgroundError = true; + } + + @Override + public void onStallConditionsChanged(final WriteStallInfo writeStallInfo) { + capturedListenerEvents.stallConditionsChanged = true; + } + + @Override + public void onFileReadFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileReadFinish = true; + } + + @Override + public void onFileWriteFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileWriteFinish = true; + } + + @Override + public void onFileFlushFinish(final FileOperationInfo 
fileOperationInfo) { + capturedListenerEvents.fileFlushFinish = true; + } + + @Override + public void onFileSyncFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileSyncFinish = true; + } + + @Override + public void onFileRangeSyncFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileRangeSyncFinish = true; + } + + @Override + public void onFileTruncateFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileTruncateFinish = true; + } + + @Override + public void onFileCloseFinish(final FileOperationInfo fileOperationInfo) { + capturedListenerEvents.fileCloseFinish = true; + } + + @Override + public boolean shouldBeNotifiedOnFileIO() { + capturedListenerEvents.shouldBeNotifiedOnFileIO = true; + return false; + } + + @Override + public boolean onErrorRecoveryBegin( + final BackgroundErrorReason backgroundErrorReason, final Status backgroundError) { + capturedListenerEvents.errorRecoveryBegin = true; + return true; + } + + @Override + public void onErrorRecoveryCompleted(final Status oldBackgroundError) { + capturedListenerEvents.errorRecoveryCompleted = true; + } + } +} diff --git a/java/src/test/java/org/rocksdb/LRUCacheTest.java b/java/src/test/java/org/rocksdb/LRUCacheTest.java index d2cd15b7e97..275cb560a13 100644 --- a/java/src/test/java/org/rocksdb/LRUCacheTest.java +++ b/java/src/test/java/org/rocksdb/LRUCacheTest.java @@ -5,23 +5,27 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.ClassRule; import org.junit.Test; public class LRUCacheTest { - - static { - RocksDB.loadLibrary(); - } + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); @Test public void newLRUCache() { - final long capacity = 1000; + final long capacity = 80000000; final int numShardBits = 16; final boolean strictCapacityLimit = true; - final double highPriPoolRatio = 5; + final double highPriPoolRatio = 0.05; try(final Cache lruCache = new LRUCache(capacity, numShardBits, strictCapacityLimit, highPriPoolRatio)) { //no op + assertThat(lruCache.getUsage()).isGreaterThanOrEqualTo(0); + assertThat(lruCache.getPinnedUsage()).isGreaterThanOrEqualTo(0); } } } diff --git a/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java b/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java index d1bdf0f8844..970e58c0c2e 100644 --- a/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java +++ b/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java @@ -15,6 +15,9 @@ import static org.junit.Assert.assertEquals; public class NativeComparatorWrapperTest { + static { + RocksDB.loadLibrary(); + } @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java index 043de032c1a..e402cb4748a 100644 --- a/java/src/test/java/org/rocksdb/OptionsTest.java +++ b/java/src/test/java/org/rocksdb/OptionsTest.java @@ -6,13 +6,13 @@ package org.rocksdb; import static org.assertj.core.api.Assertions.assertThat; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; +import static org.junit.Assert.*; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Paths; import java.util.*; +import java.util.concurrent.atomic.AtomicBoolean; import org.junit.ClassRule; import org.junit.Test; import 
org.rocksdb.test.RemoveEmptyValueCompactionFilterFactory; @@ -1436,4 +1436,38 @@ public void skipCheckingSstFileSizesOnDbOpen() { assertThat(options.skipCheckingSstFileSizesOnDbOpen()).isEqualTo(true); } } + + @Test + public void eventListeners() { + final AtomicBoolean wasCalled1 = new AtomicBoolean(); + final AtomicBoolean wasCalled2 = new AtomicBoolean(); + try (final Options options = new Options(); + final AbstractEventListener el1 = + new AbstractEventListener() { + @Override + public void onTableFileDeleted(final TableFileDeletionInfo tableFileDeletionInfo) { + wasCalled1.set(true); + } + }; + final AbstractEventListener el2 = + new AbstractEventListener() { + @Override + public void onMemTableSealed(final MemTableInfo memTableInfo) { + wasCalled2.set(true); + } + }) { + assertThat(options.setListeners(Arrays.asList(el1, el2))).isEqualTo(options); + List listeners = options.listeners(); + assertEquals(el1, listeners.get(0)); + assertEquals(el2, listeners.get(1)); + options.setListeners(Collections.emptyList()); + listeners.get(0).onTableFileDeleted(null); + assertTrue(wasCalled1.get()); + listeners.get(1).onMemTableSealed(null); + assertTrue(wasCalled2.get()); + List listeners2 = options.listeners(); + assertNotNull(listeners2); + assertEquals(0, listeners2.size()); + } + } } diff --git a/java/src/test/java/org/rocksdb/ReadOnlyTest.java b/java/src/test/java/org/rocksdb/ReadOnlyTest.java index ad6e746aa62..5b40a5df1fa 100644 --- a/java/src/test/java/org/rocksdb/ReadOnlyTest.java +++ b/java/src/test/java/org/rocksdb/ReadOnlyTest.java @@ -31,115 +31,60 @@ public void readOnlyOpen() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { db.put("key".getBytes(), "value".getBytes()); - try (final RocksDB db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath())) { - assertThat("value"). 
- isEqualTo(new String(db2.get("key".getBytes()))); - } + } + try (final RocksDB db = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath())) { + assertThat("value").isEqualTo(new String(db.get("key".getBytes()))); } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { final List cfDescriptors = new ArrayList<>(); - cfDescriptors.add(new ColumnFamilyDescriptor( - RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); - + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); final List columnFamilyHandleList = new ArrayList<>(); - try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), - cfDescriptors, columnFamilyHandleList)) { - try (final ColumnFamilyOptions newCfOpts = new ColumnFamilyOptions(); - final ColumnFamilyOptions newCf2Opts = new ColumnFamilyOptions() - ) { - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf".getBytes(), newCfOpts))); - columnFamilyHandleList.add(db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf2".getBytes(), newCf2Opts))); - db.put(columnFamilyHandleList.get(2), "key2".getBytes(), - "value2".getBytes()); + try (final RocksDB db = RocksDB.open( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + columnFamilyHandleList.add( + db.createColumnFamily(new ColumnFamilyDescriptor("new_cf".getBytes(), cfOpts))); + columnFamilyHandleList.add( + db.createColumnFamily(new ColumnFamilyDescriptor("new_cf2".getBytes(), cfOpts))); + db.put(columnFamilyHandleList.get(2), "key2".getBytes(), "value2".getBytes()); + } - final List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try (final RocksDB db2 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList)) { - try (final ColumnFamilyOptions newCfOpts2 = - new ColumnFamilyOptions(); - final ColumnFamilyOptions newCf2Opts2 = - new ColumnFamilyOptions() - ) { - assertThat(db2.get("key2".getBytes())).isNull(); - assertThat(db2.get(readOnlyColumnFamilyHandleList.get(0), - "key2".getBytes())). 
- isNull(); - cfDescriptors.clear(); - cfDescriptors.add( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, - newCfOpts2)); - cfDescriptors.add(new ColumnFamilyDescriptor("new_cf2".getBytes(), - newCf2Opts2)); + columnFamilyHandleList.clear(); + try (final RocksDB db = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + assertThat(db.get("key2".getBytes())).isNull(); + assertThat(db.get(columnFamilyHandleList.get(0), "key2".getBytes())).isNull(); + } - final List readOnlyColumnFamilyHandleList2 - = new ArrayList<>(); - try (final RocksDB db3 = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList2)) { - try { - assertThat(new String(db3.get( - readOnlyColumnFamilyHandleList2.get(1), - "key2".getBytes()))).isEqualTo("value2"); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList2) { - columnFamilyHandle.close(); - } - } - } - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } - } - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - columnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + cfDescriptors.clear(); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf2".getBytes(), cfOpts)); + columnFamilyHandleList.clear(); + try (final RocksDB db = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { + assertThat(new String(db.get(columnFamilyHandleList.get(1), "key2".getBytes()))) + .isEqualTo("value2"); } } } @Test(expected = RocksDBException.class) public void failToWriteInReadOnly() throws RocksDBException { - try (final Options options = new Options() - .setCreateIfMissing(true)) { - - try (final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { - //no-op + try (final Options options = new Options().setCreateIfMissing(true)) { + try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + // no-op } } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { - final List cfDescriptors = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) - ); + final List cfDescriptors = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); - final List readOnlyColumnFamilyHandleList = - new ArrayList<>(); - try (final RocksDB rDb = RocksDB.openReadOnly( - dbFolder.getRoot().getAbsolutePath(), cfDescriptors, - readOnlyColumnFamilyHandleList)) { - try { - // test that put fails in readonly mode - rDb.put("key".getBytes(), "value".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + final List readOnlyColumnFamilyHandleList = new ArrayList<>(); + try (final RocksDB rDb = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, readOnlyColumnFamilyHandleList)) { + // test that put fails in readonly mode + rDb.put("key".getBytes(), "value".getBytes()); } } } @@ -161,15 +106,7 @@ public void failToCFWriteInReadOnly() throws RocksDBException { try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - try { - rDb.put(readOnlyColumnFamilyHandleList.get(0), - "key".getBytes(), 
"value".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + rDb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); } } } @@ -193,14 +130,7 @@ public void failToRemoveInReadOnly() throws RocksDBException { try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - try { - rDb.delete("key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + rDb.delete("key".getBytes()); } } } @@ -223,15 +153,8 @@ public void failToCFRemoveInReadOnly() throws RocksDBException { try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - try { rDb.delete(readOnlyColumnFamilyHandleList.get(0), "key".getBytes()); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } } } } @@ -256,15 +179,8 @@ public void failToWriteBatchReadOnly() throws RocksDBException { readOnlyColumnFamilyHandleList); final WriteBatch wb = new WriteBatch(); final WriteOptions wOpts = new WriteOptions()) { - try { wb.put("key".getBytes(), "value".getBytes()); rDb.write(wOpts, wb); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } } } } @@ -289,16 +205,9 @@ public void failToCFWriteBatchReadOnly() throws RocksDBException { readOnlyColumnFamilyHandleList); final WriteBatch wb = new WriteBatch(); final WriteOptions wOpts = new WriteOptions()) { - try { wb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); rDb.write(wOpts, wb); - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : - readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } } } } @@ -318,14 +227,7 @@ public void errorIfWalFileExists() throws RocksDBException { try (final DBOptions options = new DBOptions(); final RocksDB rDb = RocksDB.openReadOnly(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList, true);) { - try { - // no-op... should have raised an error as errorIfWalFileExists=true - - } finally { - for (final ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) { - columnFamilyHandle.close(); - } - } + // no-op... 
should have raised an error as errorIfWalFileExists=true } } } diff --git a/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/java/src/test/java/org/rocksdb/ReadOptionsTest.java index 675023ef3a2..689c48cb0ef 100644 --- a/java/src/test/java/org/rocksdb/ReadOptionsTest.java +++ b/java/src/test/java/org/rocksdb/ReadOptionsTest.java @@ -159,6 +159,8 @@ public void iterateUpperBound() { Slice upperBound = buildRandomSlice(); opt.setIterateUpperBound(upperBound); assertThat(Arrays.equals(upperBound.data(), opt.iterateUpperBound().data())).isTrue(); + opt.setIterateUpperBound(null); + assertThat(opt.iterateUpperBound()).isNull(); } } @@ -175,6 +177,8 @@ public void iterateLowerBound() { Slice lowerBound = buildRandomSlice(); opt.setIterateLowerBound(lowerBound); assertThat(Arrays.equals(lowerBound.data(), opt.iterateLowerBound().data())).isTrue(); + opt.setIterateLowerBound(null); + assertThat(opt.iterateLowerBound()).isNull(); } } diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java index fc62dc80e1f..20588084c80 100644 --- a/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -1271,6 +1271,26 @@ public void getApproximateMemTableStats() throws RocksDBException { } } + @Test + public void getApproximateMemTableStatsSingleKey() throws RocksDBException { + final byte key1[] = "key1".getBytes(UTF_8); + final byte key2[] = "key2".getBytes(UTF_8); + final byte key3[] = "key3".getBytes(UTF_8); + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.put(key1, key1); + + final RocksDB.CountAndSize stats = + db.getApproximateMemTableStats(new Range(new Slice(key1), new Slice(key3))); + + assertThat(stats).isNotNull(); + assertThat(stats.count).isEqualTo(1); + assertThat(stats.size).isGreaterThan(1); + } + } + } + @Ignore("TODO(AR) re-enable when ready!") @Test public void compactFiles() throws RocksDBException { @@ -1456,11 +1476,11 @@ public void getLiveFiles() throws RocksDBException { try (final RocksDB db = RocksDB.open(options, dbPath)) { final RocksDB.LiveFiles livefiles = db.getLiveFiles(true); assertThat(livefiles).isNotNull(); - assertThat(livefiles.manifestFileSize).isEqualTo(13); + assertThat(livefiles.manifestFileSize).isEqualTo(57); assertThat(livefiles.files.size()).isEqualTo(3); assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT"); - assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000001"); - assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000005"); + assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000004"); + assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000007"); } } } diff --git a/java/src/test/java/org/rocksdb/test/TestableEventListener.java b/java/src/test/java/org/rocksdb/test/TestableEventListener.java new file mode 100644 index 00000000000..865ad5cf78b --- /dev/null +++ b/java/src/test/java/org/rocksdb/test/TestableEventListener.java @@ -0,0 +1,23 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+package org.rocksdb.test; + +import org.rocksdb.AbstractEventListener; + +public class TestableEventListener extends AbstractEventListener { + public TestableEventListener() { + super(); + } + + public TestableEventListener(final EnabledEventCallback... enabledEventCallbacks) { + super(enabledEventCallbacks); + } + + public void invokeAllCallbacks() { + invokeAllCallbacks(nativeHandle_); + } + + private static native void invokeAllCallbacks(final long handle); +} diff --git a/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java b/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java index 89081947193..f80e69c1c60 100644 --- a/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java +++ b/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java @@ -156,8 +156,10 @@ public boolean equals(final Object o) { @Override public int hashCode() { - - return Objects.hash(action, columnFamilyId, key, value); + int result = Objects.hash(action, columnFamilyId); + result = 31 * result + Arrays.hashCode(key); + result = 31 * result + Arrays.hashCode(value); + return result; } } diff --git a/logging/auto_roll_logger.cc b/logging/auto_roll_logger.cc index 3533724ba34..1ff08c1adef 100644 --- a/logging/auto_roll_logger.cc +++ b/logging/auto_roll_logger.cc @@ -6,8 +6,12 @@ #include "logging/auto_roll_logger.h" #include + #include "file/filename.h" #include "logging/logging.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { @@ -15,7 +19,9 @@ namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE // -- AutoRollLogger -AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, +AutoRollLogger::AutoRollLogger(const std::shared_ptr& fs, + const std::shared_ptr& clock, + const std::string& dbname, const std::string& db_log_dir, size_t log_max_size, size_t log_file_time_to_roll, @@ -24,24 +30,26 @@ AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, : Logger(log_level), dbname_(dbname), db_log_dir_(db_log_dir), - env_(env), + fs_(fs), + clock_(clock), status_(Status::OK()), kMaxLogFileSize(log_max_size), kLogFileTimeToRoll(log_file_time_to_roll), kKeepLogFileNum(keep_log_file_num), - cached_now(static_cast(env_->NowMicros() * 1e-6)), + cached_now(static_cast(clock_->NowMicros() * 1e-6)), ctime_(cached_now), cached_now_access_count(0), call_NowMicros_every_N_records_(100), mutex_() { - Status s = env->GetAbsolutePath(dbname, &db_absolute_path_); + Status s = fs->GetAbsolutePath(dbname, io_options_, &db_absolute_path_, + &io_context_); if (s.IsNotSupported()) { db_absolute_path_ = dbname; } else { status_ = s; } log_fname_ = InfoLogFileName(dbname_, db_absolute_path_, db_log_dir_); - if (env_->FileExists(log_fname_).ok()) { + if (fs_->FileExists(log_fname_, io_options_, &io_context_).ok()) { RollLogFile(); } GetExistingFiles(); @@ -53,7 +61,7 @@ AutoRollLogger::AutoRollLogger(Env* env, const std::string& dbname, Status AutoRollLogger::ResetLogger() { TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); - status_ = env_->NewLogger(log_fname_, &logger_); + status_ = fs_->NewLogger(log_fname_, io_options_, &logger_, &io_context_); TEST_SYNC_POINT("AutoRollLogger::ResetLogger:AfterNewLogger"); if (!status_.ok()) { @@ -67,7 +75,7 @@ Status AutoRollLogger::ResetLogger() { "The underlying logger doesn't support GetLogFileSize()"); } if (status_.ok()) { - cached_now = static_cast(env_->NowMicros() * 1e-6); + cached_now = 
static_cast(clock_->NowMicros() * 1e-6); ctime_ = cached_now; cached_now_access_count = 0; } @@ -79,14 +87,14 @@ void AutoRollLogger::RollLogFile() { // This function is called when log is rotating. Two rotations // can happen quickly (NowMicro returns same value). To not overwrite // previous log file we increment by one micro second and try again. - uint64_t now = env_->NowMicros(); + uint64_t now = clock_->NowMicros(); std::string old_fname; do { old_fname = OldInfoLogFileName( dbname_, now, db_absolute_path_, db_log_dir_); now++; - } while (env_->FileExists(old_fname).ok()); - Status s = env_->RenameFile(log_fname_, old_fname); + } while (fs_->FileExists(old_fname, io_options_, &io_context_).ok()); + Status s = fs_->RenameFile(log_fname_, old_fname, io_options_, &io_context_); if (!s.ok()) { // What should we do on error? } @@ -103,7 +111,7 @@ void AutoRollLogger::GetExistingFiles() { std::string parent_dir; std::vector info_log_files; Status s = - GetInfoLogFiles(env_, db_log_dir_, dbname_, &parent_dir, &info_log_files); + GetInfoLogFiles(fs_, db_log_dir_, dbname_, &parent_dir, &info_log_files); if (status_.ok()) { status_ = s; } @@ -117,7 +125,7 @@ void AutoRollLogger::GetExistingFiles() { } Status AutoRollLogger::TrimOldLogFiles() { - // Here we directly list info files and delete them through Env. + // Here we directly list info files and delete them through FileSystem. // The deletion isn't going through DB, so there are shortcomes: // 1. the deletion is not rate limited by SstFileManager // 2. there is a chance that an I/O will be issued here @@ -130,7 +138,8 @@ Status AutoRollLogger::TrimOldLogFiles() { // it's essentially the same thing, and checking empty before accessing // the queue feels safer. while (!old_log_files_.empty() && old_log_files_.size() >= kKeepLogFileNum) { - Status s = env_->DeleteFile(old_log_files_.front()); + Status s = + fs_->DeleteFile(old_log_files_.front(), io_options_, &io_context_); // Remove the file from the tracking anyway. It's possible that // DB cleaned up the old log file, or people cleaned it up manually. 
old_log_files_.pop(); @@ -241,7 +250,7 @@ void AutoRollLogger::LogHeader(const char* format, va_list args) { bool AutoRollLogger::LogExpired() { if (cached_now_access_count >= call_NowMicros_every_N_records_) { - cached_now = static_cast(env_->NowMicros() * 1e-6); + cached_now = static_cast(clock_->NowMicros() * 1e-6); cached_now_access_count = 0; } @@ -267,15 +276,16 @@ Status CreateLoggerFromOptions(const std::string& dbname, std::string fname = InfoLogFileName(dbname, db_absolute_path, options.db_log_dir); + const auto& clock = env->GetSystemClock(); env->CreateDirIfMissing(dbname) .PermitUncheckedError(); // In case it does not exist // Currently we only support roll by time-to-roll and log size #ifndef ROCKSDB_LITE if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { AutoRollLogger* result = new AutoRollLogger( - env, dbname, options.db_log_dir, options.max_log_file_size, - options.log_file_time_to_roll, options.keep_log_file_num, - options.info_log_level); + env->GetFileSystem(), clock, dbname, options.db_log_dir, + options.max_log_file_size, options.log_file_time_to_roll, + options.keep_log_file_num, options.info_log_level); s = result->GetStatus(); if (!s.ok()) { delete result; @@ -286,9 +296,9 @@ Status CreateLoggerFromOptions(const std::string& dbname, } #endif // !ROCKSDB_LITE // Open a log file in the same directory as the db - env->RenameFile(fname, - OldInfoLogFileName(dbname, env->NowMicros(), db_absolute_path, - options.db_log_dir)) + env->RenameFile( + fname, OldInfoLogFileName(dbname, clock->NowMicros(), db_absolute_path, + options.db_log_dir)) .PermitUncheckedError(); s = env->NewLogger(fname, logger); if (logger->get() != nullptr) { diff --git a/logging/auto_roll_logger.h b/logging/auto_roll_logger.h index 2b63fc9d9c5..ccbce1d9940 100644 --- a/logging/auto_roll_logger.h +++ b/logging/auto_roll_logger.h @@ -18,14 +18,18 @@ #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { +class FileSystem; +class SystemClock; #ifndef ROCKSDB_LITE // Rolls the log file by size and/or time class AutoRollLogger : public Logger { public: - AutoRollLogger(Env* env, const std::string& dbname, - const std::string& db_log_dir, size_t log_max_size, - size_t log_file_time_to_roll, size_t keep_log_file_num, + AutoRollLogger(const std::shared_ptr& fs, + const std::shared_ptr& clock, + const std::string& dbname, const std::string& db_log_dir, + size_t log_max_size, size_t log_file_time_to_roll, + size_t keep_log_file_num, const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL); using Logger::Logv; @@ -134,7 +138,8 @@ class AutoRollLogger : public Logger { std::string dbname_; std::string db_log_dir_; std::string db_absolute_path_; - Env* env_; + std::shared_ptr fs_; + std::shared_ptr clock_; std::shared_ptr logger_; // current status of the logger Status status_; @@ -148,11 +153,13 @@ class AutoRollLogger : public Logger { // Full path is stored here. It consumes signifianctly more memory // than only storing file name. Can optimize if it causes a problem. 
std::queue old_log_files_; - // to avoid frequent env->NowMicros() calls, we cached the current time + // to avoid frequent clock->NowMicros() calls, we cached the current time uint64_t cached_now; uint64_t ctime_; uint64_t cached_now_access_count; uint64_t call_NowMicros_every_N_records_; + IOOptions io_options_; + IODebugContext io_context_; mutable port::Mutex mutex_; }; #endif // !ROCKSDB_LITE diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc index 520d9de2354..59e0ebac658 100644 --- a/logging/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -7,8 +7,9 @@ #ifndef ROCKSDB_LITE #include "logging/auto_roll_logger.h" -#include + #include + #include #include #include @@ -17,18 +18,24 @@ #include #include #include + #include "logging/logging.h" #include "port/port.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "test_util/testutil.h" namespace ROCKSDB_NAMESPACE { namespace { -class NoSleepEnv : public EnvWrapper { +class NoSleepClock : public SystemClockWrapper { public: - NoSleepEnv(Env* base) : EnvWrapper(base) {} + NoSleepClock( + const std::shared_ptr& base = SystemClock::Default()) + : SystemClockWrapper(base) {} + const char* Name() const override { return "NoSleepClock"; } void SleepForMicroseconds(int micros) override { fake_time_ += static_cast(micros); } @@ -76,7 +83,9 @@ class AutoRollLoggerTest : public testing::Test { void RollLogFileBySizeTest(AutoRollLogger* logger, size_t log_max_size, const std::string& log_message); - void RollLogFileByTimeTest(Env*, AutoRollLogger* logger, size_t time, + void RollLogFileByTimeTest(const std::shared_ptr& fs, + const std::shared_ptr& sc, + AutoRollLogger* logger, size_t time, const std::string& log_message); // return list of files under kTestDir that contains "LOG" std::vector GetLogFiles() { @@ -157,21 +166,22 @@ void AutoRollLoggerTest::RollLogFileBySizeTest(AutoRollLogger* logger, ASSERT_TRUE(message_size == logger->GetLogFileSize()); } -void AutoRollLoggerTest::RollLogFileByTimeTest(Env* env, AutoRollLogger* logger, - size_t time, - const std::string& log_message) { +void AutoRollLoggerTest::RollLogFileByTimeTest( + const std::shared_ptr& fs, + const std::shared_ptr& sc, AutoRollLogger* logger, size_t time, + const std::string& log_message) { uint64_t expected_ctime; uint64_t actual_ctime; uint64_t total_log_size; - EXPECT_OK(env->GetFileSize(kLogFile, &total_log_size)); + EXPECT_OK(fs->GetFileSize(kLogFile, IOOptions(), &total_log_size, nullptr)); expected_ctime = logger->TEST_ctime(); logger->SetCallNowMicrosEveryNRecords(0); // -- Write to the log for several times, which is supposed // to be finished before time. for (int i = 0; i < 10; ++i) { - env->SleepForMicroseconds(50000); + sc->SleepForMicroseconds(50000); LogMessage(logger, log_message.c_str()); EXPECT_OK(logger->GetStatus()); // Make sure we always write to the same log file (by @@ -186,7 +196,7 @@ void AutoRollLoggerTest::RollLogFileByTimeTest(Env* env, AutoRollLogger* logger, } // -- Make the log file expire - env->SleepForMicroseconds(static_cast(time * 1000000)); + sc->SleepForMicroseconds(static_cast(time * 1000000)); LogMessage(logger, log_message.c_str()); // At this time, the new log file should be created. 
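
For reference, the hunks above replace AutoRollLogger's old Env* parameter with a FileSystem plus a SystemClock, and the tests drive time through a SystemClockWrapper. A minimal sketch of the new call pattern, assuming a non-ROCKSDB_LITE build; the helper function name, path, and size values below are illustrative and not taken from the patch:

#include "logging/auto_roll_logger.h"
#include "rocksdb/file_system.h"
#include "rocksdb/system_clock.h"

void MakeExampleLogger() {
  using namespace ROCKSDB_NAMESPACE;
  // Roll by size only (1 MB per file), keep at most 10 rolled files.
  // The dbname directory is assumed to exist already.
  AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(),
                        /*dbname=*/"/tmp/rocksdb_example",
                        /*db_log_dir=*/"",
                        /*log_max_size=*/1024 * 1024,
                        /*log_file_time_to_roll=*/0,
                        /*keep_log_file_num=*/10);
  // In tests, a SystemClockWrapper subclass (such as the NoSleepClock above)
  // can be passed in place of SystemClock::Default() to control time.
}
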
@@ -200,15 +210,15 @@ TEST_F(AutoRollLoggerTest, RollLogFileBySize) { size_t log_max_size = 1024 * 5; size_t keep_log_file_num = 10; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_max_size, 0, - keep_log_file_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_max_size, 0, keep_log_file_num); RollLogFileBySizeTest(&logger, log_max_size, kSampleMessage + ":RollLogFileBySize"); } TEST_F(AutoRollLoggerTest, RollLogFileByTime) { - NoSleepEnv nse(Env::Default()); + auto nsc = std::make_shared(); size_t time = 2; size_t log_size = 1024 * 5; @@ -217,10 +227,11 @@ TEST_F(AutoRollLoggerTest, RollLogFileByTime) { InitTestDb(); // -- Test the existence of file during the server restart. ASSERT_EQ(Status::NotFound(), default_env->FileExists(kLogFile)); - AutoRollLogger logger(&nse, kTestDir, "", log_size, time, keep_log_file_num); + AutoRollLogger logger(default_env->GetFileSystem(), nsc, kTestDir, "", + log_size, time, keep_log_file_num); ASSERT_OK(default_env->FileExists(kLogFile)); - RollLogFileByTimeTest(&nse, &logger, time, + RollLogFileByTimeTest(default_env->GetFileSystem(), nsc, &logger, time, kSampleMessage + ":RollLogFileByTime"); } @@ -255,15 +266,17 @@ TEST_F(AutoRollLoggerTest, OpenLogFilesMultipleTimesWithOptionLog_max_size) { size_t log_size = 1024; size_t keep_log_file_num = 10; - AutoRollLogger* logger = new AutoRollLogger(Env::Default(), kTestDir, "", - log_size, 0, keep_log_file_num); + AutoRollLogger* logger = + new AutoRollLogger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_size, 0, keep_log_file_num); LogMessage(logger, kSampleMessage.c_str()); ASSERT_GT(logger->GetLogFileSize(), kZero); delete logger; // reopens the log file and an empty log file will be created. 
- logger = new AutoRollLogger(Env::Default(), kTestDir, "", log_size, 0, 10); + logger = new AutoRollLogger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_size, 0, 10); ASSERT_EQ(logger->GetLogFileSize(), kZero); delete logger; } @@ -274,16 +287,16 @@ TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { InitTestDb(); - NoSleepEnv nse(Env::Default()); - AutoRollLogger logger(&nse, kTestDir, "", log_max_size, time, - keep_log_file_num); + auto nsc = std::make_shared(); + AutoRollLogger logger(FileSystem::Default(), nsc, kTestDir, "", log_max_size, + time, keep_log_file_num); // Test the ability to roll by size RollLogFileBySizeTest(&logger, log_max_size, kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); // Test the ability to roll by Time - RollLogFileByTimeTest(&nse, &logger, time, + RollLogFileByTimeTest(FileSystem::Default(), nsc, &logger, time, kSampleMessage + ":CompositeRollByTimeAndSizeLogger"); } @@ -292,7 +305,9 @@ TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) { // port TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { DBOptions options; - NoSleepEnv nse(Env::Default()); + auto nsc = std::make_shared(); + std::unique_ptr nse(new CompositeEnvWrapper(Env::Default(), nsc)); + std::shared_ptr logger; // Normal logger @@ -311,14 +326,15 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { kSampleMessage + ":CreateLoggerFromOptions - size"); // Only roll by Time - options.env = &nse; + options.env = nse.get(); InitTestDb(); options.max_log_file_size = 0; options.log_file_time_to_roll = 2; ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); auto_roll_logger = dynamic_cast(logger.get()); - RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, + RollLogFileByTimeTest(options.env->GetFileSystem(), nsc, auto_roll_logger, + options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - time"); // roll by both Time and size @@ -330,7 +346,8 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { dynamic_cast(logger.get()); RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, kSampleMessage + ":CreateLoggerFromOptions - both"); - RollLogFileByTimeTest(&nse, auto_roll_logger, options.log_file_time_to_roll, + RollLogFileByTimeTest(options.env->GetFileSystem(), nsc, auto_roll_logger, + options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - both"); // Set keep_log_file_num @@ -403,8 +420,8 @@ TEST_F(AutoRollLoggerTest, AutoDeleting) { const size_t kMaxFileSize = 512; { size_t log_num = 8; - AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, - log_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + dbname, db_log_dir, kMaxFileSize, 0, log_num); RollNTimesBySize(&logger, log_num, kMaxFileSize); ASSERT_EQ(log_num, GetLogFiles().size()); @@ -412,8 +429,8 @@ TEST_F(AutoRollLoggerTest, AutoDeleting) { // Shrink number of files { size_t log_num = 5; - AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, - log_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + dbname, db_log_dir, kMaxFileSize, 0, log_num); ASSERT_EQ(log_num, GetLogFiles().size()); RollNTimesBySize(&logger, 3, kMaxFileSize); @@ -423,8 +440,8 @@ TEST_F(AutoRollLoggerTest, AutoDeleting) { // Increase number of files again. 
{ size_t log_num = 7; - AutoRollLogger logger(Env::Default(), dbname, db_log_dir, kMaxFileSize, 0, - log_num); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + dbname, db_log_dir, kMaxFileSize, 0, log_num); ASSERT_EQ(6, GetLogFiles().size()); RollNTimesBySize(&logger, 3, kMaxFileSize); @@ -486,7 +503,8 @@ TEST_F(AutoRollLoggerTest, InfoLogLevel) { // an extra-scope to force the AutoRollLogger to flush the log file when it // becomes out of scope. { - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + kTestDir, "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -524,7 +542,8 @@ TEST_F(AutoRollLoggerTest, Close) { size_t log_size = 8192; size_t log_lines = 0; - AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0, 10); + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), kTestDir, + "", log_size, 0, 10); for (int log_level = InfoLogLevel::HEADER_LEVEL; log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) { logger.SetInfoLogLevel((InfoLogLevel)log_level); @@ -591,8 +610,9 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) { InitTestDb(); - AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/"", - LOG_MAX_SIZE, /*log_file_time_to_roll=*/0, + AutoRollLogger logger(FileSystem::Default(), SystemClock::Default(), + kTestDir, /*db_log_dir=*/"", LOG_MAX_SIZE, + /*log_file_time_to_roll=*/0, /*keep_log_file_num=*/10); if (test_num == 0) { diff --git a/logging/env_logger.h b/logging/env_logger.h index 9fecb50cf94..e8e9f1abe31 100644 --- a/logging/env_logger.h +++ b/logging/env_logger.h @@ -31,15 +31,16 @@ class EnvLogger : public Logger { const std::string& fname, const EnvOptions& options, Env* env, InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) : Logger(log_level), - file_(std::move(writable_file), fname, options, env), - last_flush_micros_(0), env_(env), + clock_(env_->GetSystemClock().get()), + file_(std::move(writable_file), fname, options, clock_), + last_flush_micros_(0), flush_pending_(false) {} ~EnvLogger() { if (!closed_) { closed_ = true; - CloseHelper(); + CloseHelper().PermitUncheckedError(); } } @@ -48,9 +49,9 @@ class EnvLogger : public Logger { mutex_.AssertHeld(); if (flush_pending_) { flush_pending_ = false; - file_.Flush(); + file_.Flush().PermitUncheckedError(); } - last_flush_micros_ = env_->NowMicros(); + last_flush_micros_ = clock_->NowMicros(); } void Flush() override { @@ -134,9 +135,9 @@ class EnvLogger : public Logger { assert(p <= limit); mutex_.Lock(); // We will ignore any error returned by Append(). - file_.Append(Slice(base, p - base)); + file_.Append(Slice(base, p - base)).PermitUncheckedError(); flush_pending_ = true; - const uint64_t now_micros = env_->NowMicros(); + const uint64_t now_micros = clock_->NowMicros(); if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { FlushLocked(); } @@ -154,11 +155,12 @@ class EnvLogger : public Logger { } private: + Env* env_; + SystemClock* clock_; WritableFileWriter file_; mutable port::Mutex mutex_; // Mutex to protect the shared variables below. 
const static uint64_t flush_every_seconds_ = 5; std::atomic_uint_fast64_t last_flush_micros_; - Env* env_; std::atomic flush_pending_; }; diff --git a/logging/event_logger.cc b/logging/event_logger.cc index f1747ad253a..78bf4f8ff5b 100644 --- a/logging/event_logger.cc +++ b/logging/event_logger.cc @@ -10,7 +10,6 @@ #include #include -#include "logging/logging.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { diff --git a/memory/arena.cc b/memory/arena.cc index ed46459d9cc..bcdad5c76fa 100644 --- a/memory/arena.cc +++ b/memory/arena.cc @@ -12,11 +12,13 @@ #include #endif #include + #include "logging/logging.h" #include "port/malloc.h" #include "port/port.h" #include "rocksdb/env.h" #include "test_util/sync_point.h" +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -160,7 +162,7 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, #ifdef MAP_HUGETLB if (huge_page_size > 0 && bytes > 0) { - // Allocate from a huge page TBL table. + // Allocate from a huge page TLB table. assert(logger != nullptr); // logger need to be passed in. size_t reserved_size = ((bytes - 1U) / huge_page_size + 1U) * huge_page_size; @@ -170,7 +172,7 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, if (addr == nullptr) { ROCKS_LOG_WARN(logger, "AllocateAligned fail to allocate huge TLB pages: %s", - strerror(errno)); + errnoStr(errno).c_str()); // fail back to malloc } else { return addr; diff --git a/memory/jemalloc_nodump_allocator.cc b/memory/jemalloc_nodump_allocator.cc index 980b08b955f..a01034e3fd6 100644 --- a/memory/jemalloc_nodump_allocator.cc +++ b/memory/jemalloc_nodump_allocator.cc @@ -132,6 +132,9 @@ size_t JemallocNodumpAllocator::UsableSize(void* p, Status NewJemallocNodumpAllocator( JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator) { + if (memory_allocator == nullptr) { + return Status::InvalidArgument("memory_allocator must be non-null."); + } *memory_allocator = nullptr; Status unsupported = Status::NotSupported( "JemallocNodumpAllocator only available with jemalloc version >= 5 " @@ -143,9 +146,6 @@ Status NewJemallocNodumpAllocator( if (!HasJemalloc()) { return unsupported; } - if (memory_allocator == nullptr) { - return Status::InvalidArgument("memory_allocator must be non-null."); - } if (options.limit_tcache_size && options.tcache_size_lower_bound >= options.tcache_size_upper_bound) { return Status::InvalidArgument( diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc index 0f620304247..d476d03fb1d 100644 --- a/memtable/memtablerep_bench.cc +++ b/memtable/memtablerep_bench.cc @@ -31,6 +31,7 @@ int main() { #include "rocksdb/memtablerep.h" #include "rocksdb/options.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "rocksdb/write_buffer_manager.h" #include "test_util/testutil.h" #include "util/gflags_compat.h" @@ -417,7 +418,7 @@ class Benchmark { uint64_t bytes_written = 0; uint64_t bytes_read = 0; uint64_t read_hits = 0; - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(SystemClock::Default().get(), true); RunThreads(&threads, &bytes_written, &bytes_read, true, &read_hits); auto elapsed_time = static_cast(timer.ElapsedNanos() / 1000); std::cout << "Elapsed time: " << static_cast(elapsed_time) << " us" diff --git a/memtable/write_buffer_manager.cc b/memtable/write_buffer_manager.cc index 9b747087081..ecbccb82b3d 100644 --- a/memtable/write_buffer_manager.cc +++ b/memtable/write_buffer_manager.cc @@ -8,7 +8,9 @@ // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #include "rocksdb/write_buffer_manager.h" -#include + +#include "cache/cache_entry_roles.h" +#include "db/db_impl/db_impl.h" #include "util/coding.h" namespace ROCKSDB_NAMESPACE { @@ -49,12 +51,16 @@ struct WriteBufferManager::CacheRep {}; #endif // ROCKSDB_LITE WriteBufferManager::WriteBufferManager(size_t _buffer_size, - std::shared_ptr cache) + std::shared_ptr cache, + bool allow_stall) : buffer_size_(_buffer_size), mutable_limit_(buffer_size_ * 7 / 8), memory_used_(0), memory_active_(0), - cache_rep_(nullptr) { + dummy_size_(0), + cache_rep_(nullptr), + allow_stall_(allow_stall), + stall_active_(false) { #ifndef ROCKSDB_LITE if (cache) { // Construct the cache key using the pointer to this. @@ -77,6 +83,17 @@ WriteBufferManager::~WriteBufferManager() { #endif // ROCKSDB_LITE } +void WriteBufferManager::ReserveMem(size_t mem) { + if (cache_rep_ != nullptr) { + ReserveMemWithCache(mem); + } else if (enabled()) { + memory_used_.fetch_add(mem, std::memory_order_relaxed); + } + if (enabled()) { + memory_active_.fetch_add(mem, std::memory_order_relaxed); + } +} + // Should only be called from write thread void WriteBufferManager::ReserveMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE @@ -91,9 +108,9 @@ void WriteBufferManager::ReserveMemWithCache(size_t mem) { // Expand size by at least 256KB. // Add a dummy record to the cache Cache::Handle* handle = nullptr; - Status s = - cache_rep_->cache_->Insert(cache_rep_->GetNextCacheKey(), nullptr, - kSizeDummyEntry, nullptr, &handle); + Status s = cache_rep_->cache_->Insert( + cache_rep_->GetNextCacheKey(), nullptr, kSizeDummyEntry, + GetNoopDeleterForRole(), &handle); s.PermitUncheckedError(); // TODO: What to do on error? // We keep the handle even if insertion fails and a null handle is // returned, so that when memory shrinks, we don't release extra @@ -104,12 +121,31 @@ void WriteBufferManager::ReserveMemWithCache(size_t mem) { // it in the future. cache_rep_->dummy_handles_.push_back(handle); cache_rep_->cache_allocated_size_ += kSizeDummyEntry; + dummy_size_.fetch_add(kSizeDummyEntry, std::memory_order_relaxed); } #else (void)mem; #endif // ROCKSDB_LITE } +void WriteBufferManager::ScheduleFreeMem(size_t mem) { + if (enabled()) { + memory_active_.fetch_sub(mem, std::memory_order_relaxed); + } +} + +void WriteBufferManager::FreeMem(size_t mem) { + if (cache_rep_ != nullptr) { + FreeMemWithCache(mem); + } else if (enabled()) { + memory_used_.fetch_sub(mem, std::memory_order_relaxed); + } + // Check if stall is active and can be ended. + if (allow_stall_) { + EndWriteStall(); + } +} + void WriteBufferManager::FreeMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE assert(cache_rep_ != nullptr); @@ -137,9 +173,56 @@ void WriteBufferManager::FreeMemWithCache(size_t mem) { } cache_rep_->dummy_handles_.pop_back(); cache_rep_->cache_allocated_size_ -= kSizeDummyEntry; + dummy_size_.fetch_sub(kSizeDummyEntry, std::memory_order_relaxed); } #else (void)mem; #endif // ROCKSDB_LITE } + +void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { + assert(wbm_stall != nullptr); + if (wbm_stall) { + std::unique_lock lock(mu_); + queue_.push_back(wbm_stall); + } + // In case thread enqueue itself and memory got freed in parallel, end the + // stall. + if (!ShouldStall()) { + EndWriteStall(); + } +} + +// Called when memory is freed in FreeMem. 
+void WriteBufferManager::EndWriteStall() { + if (enabled() && !IsStallThresholdExceeded()) { + { + std::unique_lock lock(mu_); + stall_active_.store(false, std::memory_order_relaxed); + if (queue_.empty()) { + return; + } + } + + // Get the instances from the list and call WBMStallInterface::Signal to + // change the state to running and unblock the DB instances. + // Check ShouldStall() incase stall got active by other DBs. + while (!ShouldStall() && !queue_.empty()) { + std::unique_lock lock(mu_); + StallInterface* wbm_stall = queue_.front(); + queue_.pop_front(); + wbm_stall->Signal(); + } + } +} + +void WriteBufferManager::RemoveDBFromQueue(StallInterface* wbm_stall) { + assert(wbm_stall != nullptr); + if (enabled() && allow_stall_) { + std::unique_lock lock(mu_); + queue_.remove(wbm_stall); + wbm_stall->Signal(); + } +} + } // namespace ROCKSDB_NAMESPACE diff --git a/memtable/write_buffer_manager_test.cc b/memtable/write_buffer_manager_test.cc index 0cdd7c4780b..7e3de41d18a 100644 --- a/memtable/write_buffer_manager_test.cc +++ b/memtable/write_buffer_manager_test.cc @@ -11,10 +11,11 @@ #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { - class WriteBufferManagerTest : public testing::Test {}; #ifndef ROCKSDB_LITE +const size_t kSizeDummyEntry = 256 * 1024; + TEST_F(WriteBufferManagerTest, ShouldFlush) { // A write buffer manager of size 10MB std::unique_ptr wbf( @@ -46,7 +47,33 @@ TEST_F(WriteBufferManagerTest, ShouldFlush) { ASSERT_TRUE(wbf->ShouldFlush()); wbf->FreeMem(7 * 1024 * 1024); - // 9MB total, 8MB mutable. + // 8MB total, 8MB mutable. + ASSERT_FALSE(wbf->ShouldFlush()); + + // change size: 8M limit, 7M mutable limit + wbf->SetBufferSize(8 * 1024 * 1024); + // 8MB total, 8MB mutable. + ASSERT_TRUE(wbf->ShouldFlush()); + + wbf->ScheduleFreeMem(2 * 1024 * 1024); + // 8MB total, 6MB mutable. + ASSERT_TRUE(wbf->ShouldFlush()); + + wbf->FreeMem(2 * 1024 * 1024); + // 6MB total, 6MB mutable. + ASSERT_FALSE(wbf->ShouldFlush()); + + wbf->ReserveMem(1 * 1024 * 1024); + // 7MB total, 7MB mutable. + ASSERT_FALSE(wbf->ShouldFlush()); + + wbf->ReserveMem(1 * 1024 * 1024); + // 8MB total, 8MB mutable. + ASSERT_TRUE(wbf->ShouldFlush()); + + wbf->ScheduleFreeMem(1 * 1024 * 1024); + wbf->FreeMem(1 * 1024 * 1024); + // 7MB total, 7MB mutable. ASSERT_FALSE(wbf->ShouldFlush()); } @@ -65,28 +92,35 @@ TEST_F(WriteBufferManagerTest, CacheCost) { wbf->ReserveMem(333 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 2 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 2 * 256 * 1024 + 10000); + // 2 dummy entries are added for size 333 kb. + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 2 * kSizeDummyEntry); // Allocate another 512KB wbf->ReserveMem(512 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 4 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 4 * 256 * 1024 + 10000); + // 2 more dummy entries are added for size 512. + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry); // Allocate another 10MB wbf->ReserveMem(10 * 1024 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 11 * 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 11 * 1024 * 1024 + 10000); + // 40 more entries are added for size 10 * 1024 * 1024. 
+ ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); // Free 1MB will not cause any change in cache cost wbf->FreeMem(1024 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 11 * 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 11 * 1024 * 1024 + 10000); - + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 44 * kSizeDummyEntry); ASSERT_FALSE(wbf->ShouldFlush()); // Allocate another 41MB wbf->ReserveMem(41 * 1024 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 204 * kSizeDummyEntry); ASSERT_TRUE(wbf->ShouldFlush()); ASSERT_TRUE(wbf->ShouldFlush()); @@ -94,7 +128,7 @@ TEST_F(WriteBufferManagerTest, CacheCost) { wbf->ScheduleFreeMem(20 * 1024 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 + 10000); - + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 204 * kSizeDummyEntry); // Still need flush as the hard limit hits ASSERT_TRUE(wbf->ShouldFlush()); @@ -102,6 +136,7 @@ TEST_F(WriteBufferManagerTest, CacheCost) { wbf->FreeMem(20 * 1024 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 203 * kSizeDummyEntry); ASSERT_FALSE(wbf->ShouldFlush()); @@ -109,19 +144,23 @@ TEST_F(WriteBufferManagerTest, CacheCost) { wbf->FreeMem(16 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 2 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 2 * 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 202 * kSizeDummyEntry); wbf->FreeMem(16 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 201 * kSizeDummyEntry); // Reserve 512KB will not cause any change in cache cost wbf->ReserveMem(512 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 3 * 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 201 * kSizeDummyEntry); wbf->FreeMem(16 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 4 * 256 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 51 * 1024 * 1024 - 4 * 256 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 200 * kSizeDummyEntry); // Destory write buffer manger should free everything wbf.reset(); @@ -137,6 +176,7 @@ TEST_F(WriteBufferManagerTest, NoCapCacheCost) { wbf->ReserveMem(10 * 1024 * 1024); ASSERT_GE(cache->GetPinnedUsage(), 10 * 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 10 * 1024 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 40 * kSizeDummyEntry); ASSERT_FALSE(wbf->ShouldFlush()); wbf->FreeMem(9 * 1024 * 1024); @@ -145,6 +185,7 @@ TEST_F(WriteBufferManagerTest, NoCapCacheCost) { } ASSERT_GE(cache->GetPinnedUsage(), 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 1024 * 1024 + 10000); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 4 * kSizeDummyEntry); } TEST_F(WriteBufferManagerTest, CacheFull) { @@ -156,16 +197,20 @@ TEST_F(WriteBufferManagerTest, CacheFull) { std::shared_ptr cache = NewLRUCache(lo); std::unique_ptr wbf(new WriteBufferManager(0, cache)); wbf->ReserveMem(10 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 40 * kSizeDummyEntry); size_t prev_pinned = 
cache->GetPinnedUsage(); ASSERT_GE(prev_pinned, 10 * 1024 * 1024); + // Some insert will fail wbf->ReserveMem(10 * 1024 * 1024); ASSERT_LE(cache->GetPinnedUsage(), 12 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 80 * kSizeDummyEntry); // Increase capacity so next insert will succeed cache->SetCapacity(30 * 1024 * 1024); wbf->ReserveMem(10 * 1024 * 1024); ASSERT_GT(cache->GetPinnedUsage(), 20 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 120 * kSizeDummyEntry); // Gradually release 20 MB for (int i = 0; i < 40; i++) { @@ -173,6 +218,7 @@ TEST_F(WriteBufferManagerTest, CacheFull) { } ASSERT_GE(cache->GetPinnedUsage(), 10 * 1024 * 1024); ASSERT_LT(cache->GetPinnedUsage(), 20 * 1024 * 1024); + ASSERT_EQ(wbf->dummy_entries_in_cache_usage(), 95 * kSizeDummyEntry); } #endif // ROCKSDB_LITE diff --git a/monitoring/histogram_test.cc b/monitoring/histogram_test.cc index 36a7d71542c..fd7c004378b 100644 --- a/monitoring/histogram_test.cc +++ b/monitoring/histogram_test.cc @@ -3,11 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // +#include "monitoring/histogram.h" + #include -#include "monitoring/histogram.h" #include "monitoring/histogram_windowing.h" +#include "rocksdb/system_clock.h" +#include "test_util/mock_time_env.h" #include "test_util/testharness.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -16,16 +20,22 @@ class HistogramTest : public testing::Test {}; namespace { const double kIota = 0.1; const HistogramBucketMapper bucketMapper; - Env* env = Env::Default(); + std::shared_ptr clock = + std::make_shared(SystemClock::Default()); } void PopulateHistogram(Histogram& histogram, uint64_t low, uint64_t high, uint64_t loop = 1) { + Random rnd(test::RandomSeed()); for (; loop > 0; loop--) { for (uint64_t i = low; i <= high; i++) { histogram.Add(i); + // sleep a random microseconds [0-10) + clock->MockSleepForMicroseconds(rnd.Uniform(10)); } } + // make sure each data population at least take some time + clock->MockSleepForMicroseconds(1); } void BasicOperation(Histogram& histogram) { @@ -131,23 +141,23 @@ TEST_F(HistogramTest, HistogramWindowingExpire) { HistogramWindowingImpl histogramWindowing(num_windows, micros_per_window, min_num_per_window); - + histogramWindowing.TEST_UpdateClock(clock); PopulateHistogram(histogramWindowing, 1, 1, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 100); ASSERT_EQ(histogramWindowing.min(), 1); ASSERT_EQ(histogramWindowing.max(), 1); ASSERT_EQ(histogramWindowing.Average(), 1); PopulateHistogram(histogramWindowing, 2, 2, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 200); ASSERT_EQ(histogramWindowing.min(), 1); ASSERT_EQ(histogramWindowing.max(), 2); ASSERT_EQ(histogramWindowing.Average(), 1.5); PopulateHistogram(histogramWindowing, 3, 3, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 300); ASSERT_EQ(histogramWindowing.min(), 1); ASSERT_EQ(histogramWindowing.max(), 3); @@ -155,7 +165,7 @@ TEST_F(HistogramTest, HistogramWindowingExpire) { // dropping oldest window with value 1, remaining 2 ~ 4 PopulateHistogram(histogramWindowing, 4, 4, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); 
ASSERT_EQ(histogramWindowing.num(), 300); ASSERT_EQ(histogramWindowing.min(), 2); ASSERT_EQ(histogramWindowing.max(), 4); @@ -163,7 +173,7 @@ TEST_F(HistogramTest, HistogramWindowingExpire) { // dropping oldest window with value 2, remaining 3 ~ 5 PopulateHistogram(histogramWindowing, 5, 5, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 300); ASSERT_EQ(histogramWindowing.min(), 3); ASSERT_EQ(histogramWindowing.max(), 5); @@ -179,18 +189,20 @@ TEST_F(HistogramTest, HistogramWindowingMerge) { histogramWindowing(num_windows, micros_per_window, min_num_per_window); HistogramWindowingImpl otherWindowing(num_windows, micros_per_window, min_num_per_window); + histogramWindowing.TEST_UpdateClock(clock); + otherWindowing.TEST_UpdateClock(clock); PopulateHistogram(histogramWindowing, 1, 1, 100); PopulateHistogram(otherWindowing, 1, 1, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); PopulateHistogram(histogramWindowing, 2, 2, 100); PopulateHistogram(otherWindowing, 2, 2, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); PopulateHistogram(histogramWindowing, 3, 3, 100); PopulateHistogram(otherWindowing, 3, 3, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); histogramWindowing.Merge(otherWindowing); ASSERT_EQ(histogramWindowing.num(), 600); @@ -200,14 +212,14 @@ TEST_F(HistogramTest, HistogramWindowingMerge) { // dropping oldest window with value 1, remaining 2 ~ 4 PopulateHistogram(histogramWindowing, 4, 4, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 500); ASSERT_EQ(histogramWindowing.min(), 2); ASSERT_EQ(histogramWindowing.max(), 4); // dropping oldest window with value 2, remaining 3 ~ 5 PopulateHistogram(histogramWindowing, 5, 5, 100); - env->SleepForMicroseconds(micros_per_window); + clock->MockSleepForMicroseconds(micros_per_window); ASSERT_EQ(histogramWindowing.num(), 400); ASSERT_EQ(histogramWindowing.min(), 3); ASSERT_EQ(histogramWindowing.max(), 5); diff --git a/monitoring/histogram_windowing.cc b/monitoring/histogram_windowing.cc index e114a6686c0..f31bbe06ace 100644 --- a/monitoring/histogram_windowing.cc +++ b/monitoring/histogram_windowing.cc @@ -8,15 +8,17 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "monitoring/histogram_windowing.h" -#include "monitoring/histogram.h" -#include "util/cast_util.h" #include +#include "monitoring/histogram.h" +#include "rocksdb/system_clock.h" +#include "util/cast_util.h" + namespace ROCKSDB_NAMESPACE { HistogramWindowingImpl::HistogramWindowingImpl() { - env_ = Env::Default(); + clock_ = SystemClock::Default(); window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); Clear(); } @@ -28,7 +30,7 @@ HistogramWindowingImpl::HistogramWindowingImpl( num_windows_(num_windows), micros_per_window_(micros_per_window), min_num_per_window_(min_num_per_window) { - env_ = Env::Default(); + clock_ = SystemClock::Default(); window_stats_.reset(new HistogramStat[static_cast(num_windows_)]); Clear(); } @@ -44,7 +46,7 @@ void HistogramWindowingImpl::Clear() { window_stats_[i].Clear(); } current_window_.store(0, std::memory_order_relaxed); - last_swap_time_.store(env_->NowMicros(), std::memory_order_relaxed); + last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed); } bool HistogramWindowingImpl::Empty() const { return stats_.Empty(); } @@ -129,7 +131,7 @@ void HistogramWindowingImpl::Data(HistogramData * const data) const { } void HistogramWindowingImpl::TimerTick() { - uint64_t curr_time = env_->NowMicros(); + uint64_t curr_time = clock_->NowMicros(); size_t curr_window_ = static_cast(current_window()); if (curr_time - last_swap_time() > micros_per_window_ && window_stats_[curr_window_].num() >= min_num_per_window_) { @@ -144,7 +146,7 @@ void HistogramWindowingImpl::SwapHistoryBucket() { // If mutex is held by Merge() or Clear(), next Add() will take care of the // swap, if needed. if (mutex_.try_lock()) { - last_swap_time_.store(env_->NowMicros(), std::memory_order_relaxed); + last_swap_time_.store(clock_->NowMicros(), std::memory_order_relaxed); uint64_t curr_window = current_window(); uint64_t next_window = (curr_window == num_windows_ - 1) ? diff --git a/monitoring/histogram_windowing.h b/monitoring/histogram_windowing.h index 72545b07f36..f8da07b3665 100644 --- a/monitoring/histogram_windowing.h +++ b/monitoring/histogram_windowing.h @@ -10,9 +10,9 @@ #pragma once #include "monitoring/histogram.h" -#include "rocksdb/env.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; class HistogramWindowingImpl : public Histogram { @@ -44,7 +44,13 @@ class HistogramWindowingImpl : public Histogram virtual double StandardDeviation() const override; virtual void Data(HistogramData* const data) const override; -private: +#ifndef NDEBUG + void TEST_UpdateClock(const std::shared_ptr& clock) { + clock_ = clock; + } +#endif // NDEBUG + + private: void TimerTick(); void SwapHistoryBucket(); inline uint64_t current_window() const { @@ -54,7 +60,7 @@ class HistogramWindowingImpl : public Histogram return last_swap_time_.load(std::memory_order_relaxed); } - Env* env_; + std::shared_ptr clock_; std::mutex mutex_; // Aggregated stats over windows_stats_, all the computation is done diff --git a/monitoring/instrumented_mutex.cc b/monitoring/instrumented_mutex.cc index d82e594c1d0..adca63f263c 100644 --- a/monitoring/instrumented_mutex.cc +++ b/monitoring/instrumented_mutex.cc @@ -4,15 +4,17 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "monitoring/instrumented_mutex.h" + #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" namespace ROCKSDB_NAMESPACE { namespace { #ifndef NPERF_CONTEXT -Statistics* stats_for_report(Env* env, Statistics* stats) { - if (env != nullptr && stats != nullptr && +Statistics* stats_for_report(SystemClock* clock, Statistics* stats) { + if (clock != nullptr && stats != nullptr && stats->get_stats_level() > kExceptTimeForMutex) { return stats; } else { @@ -25,7 +27,7 @@ Statistics* stats_for_report(Env* env, Statistics* stats) { void InstrumentedMutex::Lock() { PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( db_mutex_lock_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, - stats_for_report(env_, stats_), stats_code_); + stats_for_report(clock_, stats_), stats_code_); LockInternal(); } @@ -39,7 +41,7 @@ void InstrumentedMutex::LockInternal() { void InstrumentedCondVar::Wait() { PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, - stats_for_report(env_, stats_), stats_code_); + stats_for_report(clock_, stats_), stats_code_); WaitInternal(); } @@ -53,7 +55,7 @@ void InstrumentedCondVar::WaitInternal() { bool InstrumentedCondVar::TimedWait(uint64_t abs_time_us) { PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, - stats_for_report(env_, stats_), stats_code_); + stats_for_report(clock_, stats_), stats_code_); return TimedWaitInternal(abs_time_us); } diff --git a/monitoring/instrumented_mutex.h b/monitoring/instrumented_mutex.h index 50c1f29c8a5..19af1b4731a 100644 --- a/monitoring/instrumented_mutex.h +++ b/monitoring/instrumented_mutex.h @@ -7,8 +7,8 @@ #include "monitoring/statistics.h" #include "port/port.h" -#include "rocksdb/env.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" #include "rocksdb/thread_status.h" #include "util/stop_watch.h" @@ -20,13 +20,16 @@ class InstrumentedCondVar; class InstrumentedMutex { public: explicit InstrumentedMutex(bool adaptive = false) - : mutex_(adaptive), stats_(nullptr), env_(nullptr), - stats_code_(0) {} + : mutex_(adaptive), stats_(nullptr), clock_(nullptr), stats_code_(0) {} - InstrumentedMutex( - Statistics* stats, Env* env, - int stats_code, bool adaptive = false) - : mutex_(adaptive), stats_(stats), env_(env), + explicit InstrumentedMutex(SystemClock* clock, bool adaptive = false) + : mutex_(adaptive), stats_(nullptr), clock_(clock), stats_code_(0) {} + + InstrumentedMutex(Statistics* stats, SystemClock* clock, int stats_code, + bool adaptive = false) + : mutex_(adaptive), + stats_(stats), + clock_(clock), stats_code_(stats_code) {} void Lock(); @@ -44,7 +47,7 @@ class InstrumentedMutex { friend class InstrumentedCondVar; port::Mutex mutex_; Statistics* stats_; - Env* env_; + SystemClock* clock_; int stats_code_; }; @@ -71,7 +74,7 @@ class InstrumentedCondVar { explicit InstrumentedCondVar(InstrumentedMutex* instrumented_mutex) : cond_(&(instrumented_mutex->mutex_)), stats_(instrumented_mutex->stats_), - env_(instrumented_mutex->env_), + clock_(instrumented_mutex->clock_), stats_code_(instrumented_mutex->stats_code_) {} void Wait(); @@ -91,7 +94,7 @@ class InstrumentedCondVar { bool TimedWaitInternal(uint64_t abs_time_us); port::CondVar cond_; Statistics* stats_; - Env* env_; + SystemClock* clock_; int stats_code_; }; diff --git a/monitoring/iostats_context.cc b/monitoring/iostats_context.cc index 2960f05e8cc..23bf3a694f0 100644 --- 
a/monitoring/iostats_context.cc +++ b/monitoring/iostats_context.cc @@ -9,19 +9,23 @@ namespace ROCKSDB_NAMESPACE { -#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL +#ifdef NIOSTATS_CONTEXT +// Should not be used because the counters are not thread-safe. +// Put here just to make get_iostats_context() simple without ifdef. +static IOStatsContext iostats_context; +#elif defined(ROCKSDB_SUPPORT_THREAD_LOCAL) __thread IOStatsContext iostats_context; +#else +#error \ + "No thread-local support. Disable iostats context with -DNIOSTATS_CONTEXT." #endif IOStatsContext* get_iostats_context() { -#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL return &iostats_context; -#else - return nullptr; -#endif } void IOStatsContext::Reset() { +#ifndef NIOSTATS_CONTEXT thread_pool_id = Env::Priority::TOTAL; bytes_read = 0; bytes_written = 0; @@ -33,6 +37,9 @@ void IOStatsContext::Reset() { prepare_write_nanos = 0; fsync_nanos = 0; logger_nanos = 0; + cpu_write_nanos = 0; + cpu_read_nanos = 0; +#endif //! NIOSTATS_CONTEXT } #define IOSTATS_CONTEXT_OUTPUT(counter) \ @@ -41,6 +48,10 @@ void IOStatsContext::Reset() { } std::string IOStatsContext::ToString(bool exclude_zero_counters) const { +#ifdef NIOSTATS_CONTEXT + (void)exclude_zero_counters; + return ""; +#else std::ostringstream ss; IOSTATS_CONTEXT_OUTPUT(thread_pool_id); IOSTATS_CONTEXT_OUTPUT(bytes_read); @@ -53,10 +64,13 @@ std::string IOStatsContext::ToString(bool exclude_zero_counters) const { IOSTATS_CONTEXT_OUTPUT(fsync_nanos); IOSTATS_CONTEXT_OUTPUT(prepare_write_nanos); IOSTATS_CONTEXT_OUTPUT(logger_nanos); + IOSTATS_CONTEXT_OUTPUT(cpu_write_nanos); + IOSTATS_CONTEXT_OUTPUT(cpu_read_nanos); std::string str = ss.str(); str.erase(str.find_last_not_of(", ") + 1); return str; +#endif //! NIOSTATS_CONTEXT } } // namespace ROCKSDB_NAMESPACE diff --git a/monitoring/iostats_context_imp.h b/monitoring/iostats_context_imp.h index a7f095d6ef1..69b0c659071 100644 --- a/monitoring/iostats_context_imp.h +++ b/monitoring/iostats_context_imp.h @@ -7,7 +7,7 @@ #include "monitoring/perf_step_timer.h" #include "rocksdb/iostats_context.h" -#ifdef ROCKSDB_SUPPORT_THREAD_LOCAL +#if defined(ROCKSDB_SUPPORT_THREAD_LOCAL) && !defined(NIOSTATS_CONTEXT) namespace ROCKSDB_NAMESPACE { extern __thread IOStatsContext iostats_context; } // namespace ROCKSDB_NAMESPACE @@ -38,13 +38,13 @@ extern __thread IOStatsContext iostats_context; iostats_step_timer_##metric.Start(); // Declare and set start time of the timer -#define IOSTATS_CPU_TIMER_GUARD(metric, env) \ +#define IOSTATS_CPU_TIMER_GUARD(metric, clock) \ PerfStepTimer iostats_step_timer_##metric( \ - &(iostats_context.metric), env, true, \ + &(iostats_context.metric), clock, true, \ PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ iostats_step_timer_##metric.Start(); -#else // ROCKSDB_SUPPORT_THREAD_LOCAL +#else // ROCKSDB_SUPPORT_THREAD_LOCAL && !NIOSTATS_CONTEXT #define IOSTATS_ADD(metric, value) #define IOSTATS_ADD_IF_POSITIVE(metric, value) @@ -55,6 +55,6 @@ extern __thread IOStatsContext iostats_context; #define IOSTATS(metric) 0 #define IOSTATS_TIMER_GUARD(metric) -#define IOSTATS_CPU_TIMER_GUARD(metric, env) static_cast(env) +#define IOSTATS_CPU_TIMER_GUARD(metric, clock) static_cast(clock) -#endif // ROCKSDB_SUPPORT_THREAD_LOCAL +#endif // ROCKSDB_SUPPORT_THREAD_LOCAL && !NIOSTATS_CONTEXT diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index 53f5024050a..d45d84fb6e3 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -9,26 +9,22 @@ namespace ROCKSDB_NAMESPACE { -#if defined(NPERF_CONTEXT) || 
!defined(ROCKSDB_SUPPORT_THREAD_LOCAL) +#if defined(NPERF_CONTEXT) +// Should not be used because the counters are not thread-safe. +// Put here just to make get_perf_context() simple without ifdef. PerfContext perf_context; -#else +#elif defined(ROCKSDB_SUPPORT_THREAD_LOCAL) #if defined(OS_SOLARIS) -__thread PerfContext perf_context_; -#else +__thread PerfContext perf_context; +#else // OS_SOLARIS thread_local PerfContext perf_context; -#endif +#endif // OS_SOLARIS +#else +#error "No thread-local support. Disable perf context with -DNPERF_CONTEXT." #endif PerfContext* get_perf_context() { -#if defined(NPERF_CONTEXT) || !defined(ROCKSDB_SUPPORT_THREAD_LOCAL) - return &perf_context; -#else -#if defined(OS_SOLARIS) - return &perf_context_; -#else return &perf_context; -#endif -#endif } PerfContext::~PerfContext() { diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index cdca27621ec..b7a56adef59 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -25,8 +25,8 @@ extern thread_local PerfContext perf_context; #define PERF_TIMER_STOP(metric) #define PERF_TIMER_START(metric) #define PERF_TIMER_GUARD(metric) -#define PERF_TIMER_GUARD_WITH_ENV(metric, env) -#define PERF_CPU_TIMER_GUARD(metric, env) +#define PERF_TIMER_GUARD_WITH_CLOCK(metric, clock) +#define PERF_CPU_TIMER_GUARD(metric, clock) #define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ ticker_type) #define PERF_TIMER_MEASURE(metric) @@ -46,14 +46,14 @@ extern thread_local PerfContext perf_context; perf_step_timer_##metric.Start(); // Declare and set start time of the timer -#define PERF_TIMER_GUARD_WITH_ENV(metric, env) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), env); \ +#define PERF_TIMER_GUARD_WITH_CLOCK(metric, clock) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), clock); \ perf_step_timer_##metric.Start(); // Declare and set start time of the timer -#define PERF_CPU_TIMER_GUARD(metric, env) \ +#define PERF_CPU_TIMER_GUARD(metric, clock) \ PerfStepTimer perf_step_timer_##metric( \ - &(perf_context.metric), env, true, \ + &(perf_context.metric), clock, true, \ PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ perf_step_timer_##metric.Start(); diff --git a/monitoring/perf_step_timer.h b/monitoring/perf_step_timer.h index f2d35d9d62d..fb049f7252b 100644 --- a/monitoring/perf_step_timer.h +++ b/monitoring/perf_step_timer.h @@ -5,26 +5,26 @@ // #pragma once #include "monitoring/perf_level_imp.h" -#include "rocksdb/env.h" -#include "util/stop_watch.h" +#include "monitoring/statistics.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { class PerfStepTimer { public: explicit PerfStepTimer( - uint64_t* metric, Env* env = nullptr, bool use_cpu_time = false, + uint64_t* metric, SystemClock* clock = nullptr, bool use_cpu_time = false, PerfLevel enable_level = PerfLevel::kEnableTimeExceptForMutex, Statistics* statistics = nullptr, uint32_t ticker_type = 0) : perf_counter_enabled_(perf_level >= enable_level), use_cpu_time_(use_cpu_time), - env_((perf_counter_enabled_ || statistics != nullptr) - ? ((env != nullptr) ? env : Env::Default()) - : nullptr), + ticker_type_(ticker_type), + clock_((perf_counter_enabled_ || statistics != nullptr) + ? (clock ? 
clock : SystemClock::Default().get()) + : nullptr), start_(0), metric_(metric), - statistics_(statistics), - ticker_type_(ticker_type) {} + statistics_(statistics) {} ~PerfStepTimer() { Stop(); @@ -36,14 +36,6 @@ class PerfStepTimer { } } - uint64_t time_now() { - if (!use_cpu_time_) { - return env_->NowNanos(); - } else { - return env_->NowCPUNanos(); - } - } - void Measure() { if (start_) { uint64_t now = time_now(); @@ -67,13 +59,21 @@ class PerfStepTimer { } private: + uint64_t time_now() { + if (!use_cpu_time_) { + return clock_->NowNanos(); + } else { + return clock_->CPUNanos(); + } + } + const bool perf_counter_enabled_; const bool use_cpu_time_; - Env* const env_; + uint32_t ticker_type_; + SystemClock* const clock_; uint64_t start_; uint64_t* metric_; Statistics* statistics_; - uint32_t ticker_type_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/monitoring/persistent_stats_history.cc b/monitoring/persistent_stats_history.cc index 7cc869cf219..86fe98f1fc0 100644 --- a/monitoring/persistent_stats_history.cc +++ b/monitoring/persistent_stats_history.cc @@ -12,7 +12,6 @@ #include #include #include "db/db_impl/db_impl.h" -#include "port/likely.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 2c7f65a89da..18d8eb16079 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -8,7 +8,6 @@ #include #include #include -#include "port/likely.h" #include "rocksdb/statistics.h" namespace ROCKSDB_NAMESPACE { @@ -192,6 +191,20 @@ const std::vector> TickersNameMap = { "rocksdb.block.cache.compression.dict.add.redundant"}, {FILES_MARKED_TRASH, "rocksdb.files.marked.trash"}, {FILES_DELETED_IMMEDIATELY, "rocksdb.files.deleted.immediately"}, + {ERROR_HANDLER_BG_ERROR_COUNT, "rocksdb.error.handler.bg.errro.count"}, + {ERROR_HANDLER_BG_IO_ERROR_COUNT, + "rocksdb.error.handler.bg.io.errro.count"}, + {ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + "rocksdb.error.handler.bg.retryable.io.errro.count"}, + {ERROR_HANDLER_AUTORESUME_COUNT, "rocksdb.error.handler.autoresume.count"}, + {ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, + "rocksdb.error.handler.autoresume.retry.total.count"}, + {ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, + "rocksdb.error.handler.autoresume.success.count"}, + {MEMTABLE_PAYLOAD_BYTES_AT_FLUSH, + "rocksdb.memtable.payload.bytes.at.flush"}, + {MEMTABLE_GARBAGE_BYTES_AT_FLUSH, + "rocksdb.memtable.garbage.bytes.at.flush"}, }; const std::vector> HistogramsNameMap = { @@ -247,6 +260,8 @@ const std::vector> HistogramsNameMap = { "rocksdb.num.index.and.filter.blocks.read.per.level"}, {NUM_DATA_BLOCKS_READ_PER_LEVEL, "rocksdb.num.data.blocks.read.per.level"}, {NUM_SST_READ_PER_LEVEL, "rocksdb.num.sst.read.per.level"}, + {ERROR_HANDLER_AUTORESUME_RETRY_COUNT, + "rocksdb.error.handler.autoresume.retry.count"}, }; std::shared_ptr CreateDBStatistics() { diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index a1affb6d193..c1a2ad989f1 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -22,6 +22,7 @@ #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/rate_limiter.h" +#include "test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "test_util/testutil.h" #include "util/random.h" @@ -32,20 +33,23 @@ namespace ROCKSDB_NAMESPACE { class StatsHistoryTest : public DBTestBase { public: StatsHistoryTest() - : DBTestBase("/stats_history_test", /*env_do_fsync=*/true), - mock_env_(new 
MockTimeEnv(Env::Default())) {} + : DBTestBase("/stats_history_test", /*env_do_fsync=*/true) { + mock_clock_ = std::make_shared(env_->GetSystemClock()); + mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_)); + } protected: - std::unique_ptr mock_env_; + std::shared_ptr mock_clock_; + std::unique_ptr mock_env_; void SetUp() override { - mock_env_->InstallTimedWaitFixCallback(); + mock_clock_->InstallTimedWaitFixCallback(); SyncPoint::GetInstance()->SetCallBack( "DBImpl::StartPeriodicWorkScheduler:Init", [&](void* arg) { auto* periodic_work_scheduler_ptr = reinterpret_cast(arg); *periodic_work_scheduler_ptr = - PeriodicWorkTestScheduler::Default(mock_env_.get()); + PeriodicWorkTestScheduler::Default(mock_clock_); }); } }; @@ -65,17 +69,17 @@ TEST_F(StatsHistoryTest, RunStatsDumpPeriodSec) { // Wait for the first stats persist to finish, as the initial delay could be // different. dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); // Test cancel job through SetOptions ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); int old_val = counter; for (int i = 1; i < 20; ++i) { - mock_env_->MockSleepForSeconds(kPeriodSec); + mock_clock_->MockSleepForSeconds(kPeriodSec); } ASSERT_EQ(counter, old_val); Close(); @@ -97,17 +101,17 @@ TEST_F(StatsHistoryTest, StatsPersistScheduling) { // Wait for the first stats persist to finish, as the initial delay could be // different. dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); // Test cancel job through SetOptions ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); int old_val = counter; dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec * 2); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec * 2); }); ASSERT_EQ(counter, old_val); Close(); @@ -129,7 +133,7 @@ TEST_F(StatsHistoryTest, PersistentStatsFreshInstall) { ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); ASSERT_GE(counter, 1); Close(); } @@ -148,30 +152,31 @@ TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) { // make sure the first stats persist to finish dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); // Wait for stats persist to finish dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); std::unique_ptr stats_iter; - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); // disabled stats snapshots ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); size_t stats_count = 0; for (; stats_iter->Valid(); stats_iter->Next()) { auto 
stats_map = stats_iter->GetStatsMap(); - ASSERT_EQ(stats_iter->GetStatsTime(), mock_env_->NowSeconds()); + ASSERT_EQ(stats_iter->GetStatsTime(), mock_clock_->NowSeconds()); stats_count += stats_map.size(); } ASSERT_GT(stats_count, 0); // Wait a bit and verify no more stats are found for (int i = 0; i < 10; ++i) { dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(1); }); + [&] { mock_clock_->MockSleepForSeconds(1); }); } - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds(), &stats_iter)); + ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count_new = 0; for (; stats_iter->Valid(); stats_iter->Next()) { @@ -224,11 +229,12 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { const int kIterations = 10; for (int i = 0; i < kIterations; ++i) { dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); } std::unique_ptr stats_iter; - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count = 0; int slice_count = 0; @@ -239,18 +245,19 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { } size_t stats_history_size = dbfull()->TEST_EstimateInMemoryStatsHistorySize(); ASSERT_GE(slice_count, kIterations - 1); - ASSERT_GE(stats_history_size, 13000); - // capping memory cost at 13000 bytes since one slice is around 10000~13000 - ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "13000"}})); - ASSERT_EQ(13000, dbfull()->GetDBOptions().stats_history_buffer_size); + ASSERT_GE(stats_history_size, 14000); + // capping memory cost at 14000 bytes since one slice is around 10000~14000 + ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "14000"}})); + ASSERT_EQ(14000, dbfull()->GetDBOptions().stats_history_buffer_size); // Wait for stats persist to finish for (int i = 0; i < kIterations; ++i) { dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); } - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count_reopen = 0; slice_count = 0; @@ -295,11 +302,11 @@ TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { // Wait for the first stats persist to finish, as the initial delay could be // different. 
dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); // Wait for stats persist to finish dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); @@ -307,14 +314,14 @@ TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { delete iter; dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count2 = countkeys(iter); delete iter; dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count3 = countkeys(iter); @@ -323,7 +330,8 @@ TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { ASSERT_GE(key_count3, key_count2); ASSERT_EQ(key_count3 - key_count2, key_count2 - key_count1); std::unique_ptr stats_iter; - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count = 0; int slice_count = 0; @@ -344,7 +352,8 @@ TEST_F(StatsHistoryTest, GetStatsHistoryFromDisk) { ASSERT_EQ(stats_count, key_count3 - 2); // verify reopen will not cause data loss ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); size_t stats_count_reopen = 0; int slice_count_reopen = 0; @@ -386,37 +395,38 @@ TEST_F(StatsHistoryTest, PersitentStatsVerifyValue) { // Wait for the first stats persist to finish, as the initial delay could be // different. 
dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); // Wait for stats persist to finish dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); countkeys(iter); delete iter; dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); std::map stats_map_after; ASSERT_TRUE(options.statistics->getTickerMap(&stats_map_after)); std::unique_ptr stats_iter; - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); std::string sample = "rocksdb.num.iterator.deleted"; uint64_t recovered_value = 0; @@ -433,7 +443,8 @@ TEST_F(StatsHistoryTest, PersitentStatsVerifyValue) { // test stats value retains after recovery ReopenWithColumnFamilies({"default", "pikachu"}, options); - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds() + 1, &stats_iter)); + ASSERT_OK( + db_->GetStatsHistory(0, mock_clock_->NowSeconds() + 1, &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); uint64_t new_recovered_value = 0; for (int i = 2; stats_iter->Valid(); stats_iter->Next(), i++) { @@ -473,10 +484,10 @@ TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { // make sure the first stats persist to finish dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); auto iter = db_->NewIterator(ReadOptions(), dbfull()->PersistentStatsColumnFamily()); int key_count = countkeys(iter); @@ -485,7 +496,7 @@ TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { uint64_t num_write_wal = 0; std::string sample = "rocksdb.write.wal"; std::unique_ptr stats_iter; - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds(), &stats_iter)); + ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); for (; stats_iter->Valid(); stats_iter->Next()) { auto stats_map = stats_iter->GetStatsMap(); @@ -521,7 +532,7 @@ TEST_F(StatsHistoryTest, PersistentStatsCreateColumnFamilies) { ASSERT_NOK(db_->CreateColumnFamily(cf_opts, kPersistentStatsColumnFamilyName, &handle)); // verify stats is not affected by prior failed CF creation - ASSERT_OK(db_->GetStatsHistory(0, mock_env_->NowSeconds(), &stats_iter)); + ASSERT_OK(db_->GetStatsHistory(0, mock_clock_->NowSeconds(), &stats_iter)); ASSERT_TRUE(stats_iter != nullptr); num_write_wal = 0; for (; stats_iter->Valid(); stats_iter->Next()) { @@ -552,7 +563,7 @@ 
TEST_F(StatsHistoryTest, PersistentStatsReadOnly) { // Reopen and flush memtable. ASSERT_OK(TryReopen(options)); - Flush(); + ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); @@ -573,7 +584,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { // Wait for the first stats persist to finish, as the initial delay could be // different. dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec - 1); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); }); ColumnFamilyData* cfd_default = static_cast(dbfull()->DefaultColumnFamily()) @@ -592,7 +603,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { ASSERT_EQ("v0", Get(1, "Eevee")); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to all three cf, flush default cf // LogNumbers: default: 14, stats: 4, pikachu: 4 ASSERT_OK(Flush()); @@ -617,7 +628,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { ASSERT_EQ("v2", Get("foo2")); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to default and stats cf, flushing default cf // LogNumbers: default: 19, stats: 19, pikachu: 19 ASSERT_OK(Flush()); @@ -632,7 +643,7 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { ASSERT_EQ("v3", Get(1, "Jolteon")); dbfull()->TEST_WaitForStatsDumpRun( - [&] { mock_env_->MockSleepForSeconds(kPeriodSec); }); + [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); }); // writing to all three cf, flushing test cf // LogNumbers: default: 19, stats: 19, pikachu: 22 ASSERT_OK(Flush(1)); diff --git a/monitoring/thread_status_updater.cc b/monitoring/thread_status_updater.cc index 7e4b299a8c0..267a0c0b082 100644 --- a/monitoring/thread_status_updater.cc +++ b/monitoring/thread_status_updater.cc @@ -4,9 +4,12 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "monitoring/thread_status_updater.h" + #include + #include "port/likely.h" #include "rocksdb/env.h" +#include "rocksdb/system_clock.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { @@ -159,7 +162,7 @@ Status ThreadStatusUpdater::GetThreadList( std::vector* thread_list) { thread_list->clear(); std::vector> valid_list; - uint64_t now_micros = Env::Default()->NowMicros(); + uint64_t now_micros = SystemClock::Default()->NowMicros(); std::lock_guard lck(thread_list_mutex_); for (auto* thread_data : thread_data_set_) { diff --git a/monitoring/thread_status_util.cc b/monitoring/thread_status_util.cc index 13a79163c3e..37fcb9f4869 100644 --- a/monitoring/thread_status_util.cc +++ b/monitoring/thread_status_util.cc @@ -7,6 +7,7 @@ #include "monitoring/thread_status_updater.h" #include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -57,7 +58,7 @@ void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { } if (op != ThreadStatus::OP_UNKNOWN) { - uint64_t current_time = Env::Default()->NowMicros(); + uint64_t current_time = SystemClock::Default()->NowMicros(); thread_updater_local_cache_->SetOperationStartTime(current_time); } else { // TDOO(yhchiang): we could report the time when we set operation to diff --git a/monitoring/thread_status_util_debug.cc b/monitoring/thread_status_util_debug.cc index 375fe8c0ae0..c493ddca542 100644 --- a/monitoring/thread_status_util_debug.cc +++ b/monitoring/thread_status_util_debug.cc @@ -7,7 +7,7 @@ #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" -#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -23,7 +23,7 @@ void ThreadStatusUtil::TEST_SetStateDelay( void ThreadStatusUtil::TEST_StateDelay(const ThreadStatus::StateType state) { auto delay = states_delay[state].load(std::memory_order_relaxed); if (delay > 0) { - Env::Default()->SleepForMicroseconds(delay); + SystemClock::Default()->SleepForMicroseconds(delay); } } diff --git a/options/cf_options.cc b/options/cf_options.cc index fb56f238857..005a90c8554 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -38,96 +38,94 @@ namespace ROCKSDB_NAMESPACE { // http://en.cppreference.com/w/cpp/concept/StandardLayoutType // https://gist.github.com/graphitemaster/494f21190bb2c63c5516 #ifndef ROCKSDB_LITE -static ColumnFamilyOptions dummy_cf_options; +static ImmutableCFOptions dummy_cf_options; template -int offset_of(T1 ColumnFamilyOptions::*member) { - return int(size_t(&(dummy_cf_options.*member)) - size_t(&dummy_cf_options)); -} -template -int offset_of(T1 AdvancedColumnFamilyOptions::*member) { +int offset_of(T1 ImmutableCFOptions::*member) { return int(size_t(&(dummy_cf_options.*member)) - size_t(&dummy_cf_options)); } static Status ParseCompressionOptions(const std::string& value, const std::string& name, CompressionOptions& compression_opts) { - size_t start = 0; - size_t end = value.find(':'); - if (end == std::string::npos) { + const char kDelimiter = ':'; + std::istringstream field_stream(value); + std::string field; + + if (!std::getline(field_stream, field, kDelimiter)) { return Status::InvalidArgument("unable to parse the specified CF option " + name); } - compression_opts.window_bits = ParseInt(value.substr(start, end - start)); - start = end + 1; - end = value.find(':', start); - if (end == std::string::npos) { + compression_opts.window_bits = ParseInt(field); + + if (!std::getline(field_stream, field, kDelimiter)) { return 
Status::InvalidArgument("unable to parse the specified CF option " + name); } - compression_opts.level = ParseInt(value.substr(start, end - start)); - start = end + 1; - if (start >= value.size()) { + compression_opts.level = ParseInt(field); + + if (!std::getline(field_stream, field, kDelimiter)) { return Status::InvalidArgument("unable to parse the specified CF option " + name); } - end = value.find(':', start); - compression_opts.strategy = - ParseInt(value.substr(start, value.size() - start)); + compression_opts.strategy = ParseInt(field); + // max_dict_bytes is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if (start >= value.size()) { + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { return Status::InvalidArgument( "unable to parse the specified CF option " + name); } - compression_opts.max_dict_bytes = - ParseInt(value.substr(start, value.size() - start)); - end = value.find(':', start); + compression_opts.max_dict_bytes = ParseInt(field); } + // zstd_max_train_bytes is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if (start >= value.size()) { + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { return Status::InvalidArgument( "unable to parse the specified CF option " + name); } - compression_opts.zstd_max_train_bytes = - ParseInt(value.substr(start, value.size() - start)); - end = value.find(':', start); + compression_opts.zstd_max_train_bytes = ParseInt(field); } // parallel_threads is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if (start >= value.size()) { + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { return Status::InvalidArgument( "unable to parse the specified CF option " + name); } // Since parallel_threads comes before enabled but was added optionally // later, we need to check if this is the final token (meaning it is the - // enabled bit), or if there is another token (meaning this one is - // parallel_threads) - end = value.find(':', start); - if (end != std::string::npos) { - compression_opts.parallel_threads = - ParseInt(value.substr(start, value.size() - start)); + // enabled bit), or if there are more tokens (meaning this one is + // parallel_threads). 
+ if (!field_stream.eof()) { + compression_opts.parallel_threads = ParseInt(field); } else { // parallel_threads is not serialized with this format, but enabled is - compression_opts.parallel_threads = CompressionOptions().parallel_threads; - compression_opts.enabled = - ParseBoolean("", value.substr(start, value.size() - start)); + compression_opts.enabled = ParseBoolean("", field); } } // enabled is optional for backwards compatibility - if (end != std::string::npos) { - start = end + 1; - if (start >= value.size()) { + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { return Status::InvalidArgument( "unable to parse the specified CF option " + name); } - compression_opts.enabled = - ParseBoolean("", value.substr(start, value.size() - start)); + compression_opts.enabled = ParseBoolean("", field); + } + + // max_dict_buffer_bytes is optional for backwards compatibility + if (!field_stream.eof()) { + if (!std::getline(field_stream, field, kDelimiter)) { + return Status::InvalidArgument( + "unable to parse the specified CF option " + name); + } + compression_opts.max_dict_buffer_bytes = ParseUint64(field); + } + + if (!field_stream.eof()) { + return Status::InvalidArgument("unable to parse the specified CF option " + + name); } return Status::OK(); } @@ -161,6 +159,10 @@ static std::unordered_map {"enabled", {offsetof(struct CompressionOptions, enabled), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"max_dict_buffer_bytes", + {offsetof(struct CompressionOptions, max_dict_buffer_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, }; static std::unordered_map @@ -362,7 +364,7 @@ static std::unordered_map offsetof(struct MutableCFOptions, compaction_options_fifo), OptionVerificationType::kNormal, OptionTypeFlags::kMutable, [](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { // This is to handle backward compatibility, where // compaction_options_fifo could be assigned a single scalar // value, say, like "23", which would be assigned to @@ -370,7 +372,7 @@ static std::unordered_map if (name == "compaction_options_fifo" && value.find("=") == std::string::npos) { // Old format. Parse just a single uint64_t value. 
- auto options = reinterpret_cast(addr); + auto options = static_cast(addr); options->max_table_files_size = ParseUint64(value); return Status::OK(); } else { @@ -408,6 +410,14 @@ static std::unordered_map {offsetof(struct MutableCFOptions, blob_compression_type), OptionType::kCompressionType, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"enable_blob_garbage_collection", + {offsetof(struct MutableCFOptions, enable_blob_garbage_collection), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"blob_garbage_collection_age_cutoff", + {offsetof(struct MutableCFOptions, blob_garbage_collection_age_cutoff), + OptionType::kDouble, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"sample_for_compression", {offsetof(struct MutableCFOptions, sample_for_compression), OptionType::kUInt64T, OptionVerificationType::kNormal, @@ -423,13 +433,12 @@ static std::unordered_map OptionVerificationType::kNormal, (OptionTypeFlags::kMutable | OptionTypeFlags::kCompareNever), [](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { // This is to handle backward compatibility, where // compression_options was a ":" separated list. if (name == kOptNameCompOpts && value.find("=") == std::string::npos) { - auto* compression = - reinterpret_cast(addr); + auto* compression = static_cast(addr); return ParseCompressionOptions(value, name, *compression); } else { return OptionTypeInfo::ParseStruct( @@ -444,13 +453,12 @@ static std::unordered_map OptionVerificationType::kNormal, (OptionTypeFlags::kMutable | OptionTypeFlags::kCompareNever), [](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { // This is to handle backward compatibility, where // compression_options was a ":" separated list. 
if (name == kOptNameBMCompOpts && value.find("=") == std::string::npos) { - auto* compression = - reinterpret_cast(addr); + auto* compression = static_cast(addr); return ParseCompressionOptions(value, name, *compression); } else { return OptionTypeInfo::ParseStruct( @@ -478,64 +486,65 @@ static std::unordered_map {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, {"inplace_update_support", - {offset_of(&ColumnFamilyOptions::inplace_update_support), + {offset_of(&ImmutableCFOptions::inplace_update_support), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"level_compaction_dynamic_level_bytes", - {offset_of(&ColumnFamilyOptions::level_compaction_dynamic_level_bytes), + {offset_of(&ImmutableCFOptions::level_compaction_dynamic_level_bytes), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"optimize_filters_for_hits", - {offset_of(&ColumnFamilyOptions::optimize_filters_for_hits), + {offset_of(&ImmutableCFOptions::optimize_filters_for_hits), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"force_consistency_checks", - {offset_of(&ColumnFamilyOptions::force_consistency_checks), + {offset_of(&ImmutableCFOptions::force_consistency_checks), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"purge_redundant_kvs_while_flush", - {offset_of(&ColumnFamilyOptions::purge_redundant_kvs_while_flush), + {offset_of(&ImmutableCFOptions::purge_redundant_kvs_while_flush), OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, {"max_mem_compaction_level", {0, OptionType::kInt, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, {"max_write_buffer_number_to_maintain", - {offset_of(&ColumnFamilyOptions::max_write_buffer_number_to_maintain), + {offset_of(&ImmutableCFOptions::max_write_buffer_number_to_maintain), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kNone, 0}}, {"max_write_buffer_size_to_maintain", - {offset_of(&ColumnFamilyOptions::max_write_buffer_size_to_maintain), + {offset_of(&ImmutableCFOptions::max_write_buffer_size_to_maintain), OptionType::kInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"min_write_buffer_number_to_merge", - {offset_of(&ColumnFamilyOptions::min_write_buffer_number_to_merge), + {offset_of(&ImmutableCFOptions::min_write_buffer_number_to_merge), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kNone, 0}}, {"num_levels", - {offset_of(&ColumnFamilyOptions::num_levels), OptionType::kInt, + {offset_of(&ImmutableCFOptions::num_levels), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"bloom_locality", - {offset_of(&ColumnFamilyOptions::bloom_locality), OptionType::kUInt32T, + {offset_of(&ImmutableCFOptions::bloom_locality), OptionType::kUInt32T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"rate_limit_delay_max_milliseconds", {0, OptionType::kUInt, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, {"compression_per_level", OptionTypeInfo::Vector( - offset_of(&ColumnFamilyOptions::compression_per_level), + offset_of(&ImmutableCFOptions::compression_per_level), OptionVerificationType::kNormal, OptionTypeFlags::kNone, {0, OptionType::kCompressionType})}, {"comparator", - {offset_of(&ColumnFamilyOptions::comparator), OptionType::kComparator, - OptionVerificationType::kByName, OptionTypeFlags::kCompareLoose, + {offset_of(&ImmutableCFOptions::user_comparator), + 
OptionType::kComparator, OptionVerificationType::kByName, + OptionTypeFlags::kCompareLoose, // Parses the string and sets the corresponding comparator - [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto old_comparator = reinterpret_cast(addr); + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto old_comparator = static_cast(addr); const Comparator* new_comparator = *old_comparator; - Status status = ObjectRegistry::NewInstance()->NewStaticObject( - value, &new_comparator); + Status status = + opts.registry->NewStaticObject(value, &new_comparator); if (status.ok()) { *old_comparator = new_comparator; return status; @@ -544,61 +553,45 @@ static std::unordered_map }}}, {"memtable_insert_with_hint_prefix_extractor", {offset_of( - &ColumnFamilyOptions::memtable_insert_with_hint_prefix_extractor), + &ImmutableCFOptions::memtable_insert_with_hint_prefix_extractor), OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull, OptionTypeFlags::kNone}}, {"memtable_factory", - {offset_of(&ColumnFamilyOptions::memtable_factory), + {offset_of(&ImmutableCFOptions::memtable_factory), OptionType::kMemTableRepFactory, OptionVerificationType::kByName, OptionTypeFlags::kNone}}, {"memtable", - {offset_of(&ColumnFamilyOptions::memtable_factory), + {offset_of(&ImmutableCFOptions::memtable_factory), OptionType::kMemTableRepFactory, OptionVerificationType::kAlias, OptionTypeFlags::kNone, // Parses the value string and updates the memtable_factory [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { + const std::string& value, void* addr) { std::unique_ptr new_mem_factory; Status s = GetMemTableRepFactoryFromString(value, &new_mem_factory); if (s.ok()) { auto memtable_factory = - reinterpret_cast*>(addr); + static_cast*>(addr); memtable_factory->reset(new_mem_factory.release()); } return s; }}}, - {"table_factory", - {offset_of(&ColumnFamilyOptions::table_factory), - OptionType::kConfigurable, OptionVerificationType::kByName, - (OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose | - OptionTypeFlags::kDontPrepare), - // Creates a new TableFactory based on value - [](const ConfigOptions& opts, const std::string& /*name*/, - const std::string& value, char* addr) { - auto table_factory = - reinterpret_cast*>(addr); - return TableFactory::CreateFromString(opts, value, table_factory); - }, - // Converts the TableFactory into its string representation - [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const char* addr, std::string* value) { - const auto* table_factory = - reinterpret_cast*>(addr); - *value = table_factory->get() ? 
table_factory->get()->Name() - : kNullptrString; - return Status::OK(); - }, - /* No equals function for table factories */ nullptr}}, + {"table_factory", OptionTypeInfo::AsCustomSharedPtr( + offset_of(&ImmutableCFOptions::table_factory), + OptionVerificationType::kByName, + (OptionTypeFlags::kCompareLoose | + OptionTypeFlags::kStringNameOnly | + OptionTypeFlags::kDontPrepare))}, {"block_based_table_factory", - {offset_of(&ColumnFamilyOptions::table_factory), - OptionType::kConfigurable, OptionVerificationType::kAlias, + {offset_of(&ImmutableCFOptions::table_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose, // Parses the input value and creates a BlockBasedTableFactory [](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { BlockBasedTableOptions* old_opts = nullptr; auto table_factory = - reinterpret_cast*>(addr); + static_cast*>(addr); if (table_factory->get() != nullptr) { old_opts = table_factory->get()->GetOptions(); @@ -622,15 +615,15 @@ static std::unordered_map } }}}, {"plain_table_factory", - {offset_of(&ColumnFamilyOptions::table_factory), - OptionType::kConfigurable, OptionVerificationType::kAlias, + {offset_of(&ImmutableCFOptions::table_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose, // Parses the input value and creates a PlainTableFactory [](const ConfigOptions& opts, const std::string& name, - const std::string& value, char* addr) { + const std::string& value, void* addr) { PlainTableOptions* old_opts = nullptr; auto table_factory = - reinterpret_cast*>(addr); + static_cast*>(addr); if (table_factory->get() != nullptr) { old_opts = table_factory->get()->GetOptions(); } @@ -653,25 +646,24 @@ static std::unordered_map } }}}, {"compaction_filter", - {offset_of(&ColumnFamilyOptions::compaction_filter), + {offset_of(&ImmutableCFOptions::compaction_filter), OptionType::kCompactionFilter, OptionVerificationType::kByName, OptionTypeFlags::kNone}}, {"compaction_filter_factory", - {offset_of(&ColumnFamilyOptions::compaction_filter_factory), + {offset_of(&ImmutableCFOptions::compaction_filter_factory), OptionType::kCompactionFilterFactory, OptionVerificationType::kByName, OptionTypeFlags::kNone}}, {"merge_operator", - {offset_of(&ColumnFamilyOptions::merge_operator), + {offset_of(&ImmutableCFOptions::merge_operator), OptionType::kMergeOperator, OptionVerificationType::kByNameAllowFromNull, OptionTypeFlags::kCompareLoose, // Parses the input value as a MergeOperator, updating the value - [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto mop = reinterpret_cast*>(addr); + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto mop = static_cast*>(addr); Status status = - ObjectRegistry::NewInstance()->NewSharedObject( - value, mop); + opts.registry->NewSharedObject(value, mop); // Only support static comparator for now. 
if (status.ok()) { return status; @@ -679,11 +671,11 @@ static std::unordered_map return Status::OK(); }}}, {"compaction_style", - {offset_of(&ColumnFamilyOptions::compaction_style), + {offset_of(&ImmutableCFOptions::compaction_style), OptionType::kCompactionStyle, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"compaction_pri", - {offset_of(&ColumnFamilyOptions::compaction_pri), + {offset_of(&ImmutableCFOptions::compaction_pri), OptionType::kCompactionPri, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, }; @@ -692,10 +684,9 @@ const std::string OptionsHelper::kCFOptionsName = "ColumnFamilyOptions"; class ConfigurableMutableCFOptions : public Configurable { public: - ConfigurableMutableCFOptions(const MutableCFOptions& mcf) { + explicit ConfigurableMutableCFOptions(const MutableCFOptions& mcf) { mutable_ = mcf; - ConfigurableHelper::RegisterOptions(*this, &mutable_, - &cf_mutable_options_type_info); + RegisterOptions(&mutable_, &cf_mutable_options_type_info); } protected: @@ -710,9 +701,7 @@ class ConfigurableCFOptions : public ConfigurableMutableCFOptions { immutable_(opts), cf_options_(opts), opt_map_(map) { - ConfigurableHelper::RegisterOptions(*this, OptionsHelper::kCFOptionsName, - &immutable_, - &cf_immutable_options_type_info); + RegisterOptions(&immutable_, &cf_immutable_options_type_info); } protected: @@ -720,10 +709,10 @@ class ConfigurableCFOptions : public ConfigurableMutableCFOptions { const ConfigOptions& config_options, const std::unordered_map& opts_map, std::unordered_map* unused) override { - Status s = ConfigurableHelper::ConfigureOptions(config_options, *this, - opts_map, unused); + Status s = Configurable::ConfigureOptions(config_options, opts_map, unused); if (s.ok()) { - cf_options_ = BuildColumnFamilyOptions(immutable_, mutable_); + UpdateColumnFamilyOptions(mutable_, &cf_options_); + UpdateColumnFamilyOptions(immutable_, &cf_options_); s = PrepareOptions(config_options); } return s; @@ -777,7 +766,7 @@ class ConfigurableCFOptions : public ConfigurableMutableCFOptions { } private: - ColumnFamilyOptions immutable_; + ImmutableCFOptions immutable_; ColumnFamilyOptions cf_options_; const std::unordered_map* opt_map_; }; @@ -795,18 +784,16 @@ std::unique_ptr CFOptionsAsConfigurable( } #endif // ROCKSDB_LITE -ImmutableCFOptions::ImmutableCFOptions(const Options& options) - : ImmutableCFOptions(ImmutableDBOptions(options), options) {} +ImmutableCFOptions::ImmutableCFOptions() : ImmutableCFOptions(Options()) {} -ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, - const ColumnFamilyOptions& cf_options) +ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) : compaction_style(cf_options.compaction_style), compaction_pri(cf_options.compaction_pri), user_comparator(cf_options.comparator), internal_comparator(InternalKeyComparator(cf_options.comparator)), - merge_operator(cf_options.merge_operator.get()), + merge_operator(cf_options.merge_operator), compaction_filter(cf_options.compaction_filter), - compaction_filter_factory(cf_options.compaction_filter_factory.get()), + compaction_filter_factory(cf_options.compaction_filter_factory), min_write_buffer_number_to_merge( cf_options.min_write_buffer_number_to_merge), max_write_buffer_number_to_maintain( @@ -815,45 +802,45 @@ ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, cf_options.max_write_buffer_size_to_maintain), inplace_update_support(cf_options.inplace_update_support), inplace_callback(cf_options.inplace_callback), - 
info_log(db_options.info_log.get()), - statistics(db_options.statistics.get()), - rate_limiter(db_options.rate_limiter.get()), - info_log_level(db_options.info_log_level), - env(db_options.env), - fs(db_options.fs.get()), - allow_mmap_reads(db_options.allow_mmap_reads), - allow_mmap_writes(db_options.allow_mmap_writes), - db_paths(db_options.db_paths), - memtable_factory(cf_options.memtable_factory.get()), - table_factory(cf_options.table_factory.get()), + memtable_factory(cf_options.memtable_factory), + table_factory(cf_options.table_factory), table_properties_collector_factories( cf_options.table_properties_collector_factories), - advise_random_on_open(db_options.advise_random_on_open), bloom_locality(cf_options.bloom_locality), purge_redundant_kvs_while_flush( cf_options.purge_redundant_kvs_while_flush), - use_fsync(db_options.use_fsync), compression_per_level(cf_options.compression_per_level), level_compaction_dynamic_level_bytes( cf_options.level_compaction_dynamic_level_bytes), - access_hint_on_compaction_start( - db_options.access_hint_on_compaction_start), - new_table_reader_for_compaction_inputs( - db_options.new_table_reader_for_compaction_inputs), num_levels(cf_options.num_levels), optimize_filters_for_hits(cf_options.optimize_filters_for_hits), force_consistency_checks(cf_options.force_consistency_checks), - allow_ingest_behind(db_options.allow_ingest_behind), - preserve_deletes(db_options.preserve_deletes), - listeners(db_options.listeners), - row_cache(db_options.row_cache), memtable_insert_with_hint_prefix_extractor( - cf_options.memtable_insert_with_hint_prefix_extractor.get()), + cf_options.memtable_insert_with_hint_prefix_extractor), cf_paths(cf_options.cf_paths), compaction_thread_limiter(cf_options.compaction_thread_limiter), - file_checksum_gen_factory(db_options.file_checksum_gen_factory.get()), - sst_partitioner_factory(cf_options.sst_partitioner_factory), - allow_data_in_errors(db_options.allow_data_in_errors) {} + sst_partitioner_factory(cf_options.sst_partitioner_factory) {} + +ImmutableOptions::ImmutableOptions() : ImmutableOptions(Options()) {} + +ImmutableOptions::ImmutableOptions(const Options& options) + : ImmutableOptions(options, options) {} + +ImmutableOptions::ImmutableOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const DBOptions& db_options, + const ImmutableCFOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const ImmutableDBOptions& db_options, + const ColumnFamilyOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} + +ImmutableOptions::ImmutableOptions(const ImmutableDBOptions& db_options, + const ImmutableCFOptions& cf_options) + : ImmutableDBOptions(db_options), ImmutableCFOptions(cf_options) {} // Multiple two operands. If they overflow, return op1. uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) { @@ -1021,9 +1008,38 @@ void MutableCFOptions::Dump(Logger* log) const { blob_file_size); ROCKS_LOG_INFO(log, " blob_compression_type: %s", CompressionTypeToString(blob_compression_type).c_str()); + ROCKS_LOG_INFO(log, " enable_blob_garbage_collection: %s", + enable_blob_garbage_collection ? 
"true" : "false"); + ROCKS_LOG_INFO(log, " blob_garbage_collection_age_cutoff: %f", + blob_garbage_collection_age_cutoff); } MutableCFOptions::MutableCFOptions(const Options& options) : MutableCFOptions(ColumnFamilyOptions(options)) {} +#ifndef ROCKSDB_LITE +Status GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + Logger* /*info_log*/, MutableCFOptions* new_options) { + assert(new_options); + *new_options = base_options; + ConfigOptions config_options; + Status s = OptionTypeInfo::ParseType( + config_options, options_map, cf_mutable_options_type_info, new_options); + if (!s.ok()) { + *new_options = base_options; + } + return s; +} + +Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, + const MutableCFOptions& mutable_opts, + std::string* opt_string) { + assert(opt_string); + opt_string->clear(); + return OptionTypeInfo::SerializeType( + config_options, cf_mutable_options_type_info, &mutable_opts, opt_string); +} +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/options/cf_options.h b/options/cf_options.h index ca086b5c80a..d4e77f04f7b 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -20,24 +20,23 @@ namespace ROCKSDB_NAMESPACE { // of DB. Raw pointers defined in this struct do not have ownership to the data // they point to. Options contains std::shared_ptr to these data. struct ImmutableCFOptions { + public: static const char* kName() { return "ImmutableCFOptions"; } - explicit ImmutableCFOptions(const Options& options); - - ImmutableCFOptions(const ImmutableDBOptions& db_options, - const ColumnFamilyOptions& cf_options); + explicit ImmutableCFOptions(); + explicit ImmutableCFOptions(const ColumnFamilyOptions& cf_options); CompactionStyle compaction_style; CompactionPri compaction_pri; const Comparator* user_comparator; - InternalKeyComparator internal_comparator; + InternalKeyComparator internal_comparator; // Only in Immutable - MergeOperator* merge_operator; + std::shared_ptr merge_operator; const CompactionFilter* compaction_filter; - CompactionFilterFactory* compaction_filter_factory; + std::shared_ptr compaction_filter_factory; int min_write_buffer_number_to_merge; @@ -52,78 +51,54 @@ struct ImmutableCFOptions { Slice delta_value, std::string* merged_value); - Logger* info_log; - - Statistics* statistics; - - RateLimiter* rate_limiter; - - InfoLogLevel info_log_level; - - Env* env; - - FileSystem* fs; + std::shared_ptr memtable_factory; - // Allow the OS to mmap file for reading sst tables. Default: false - bool allow_mmap_reads; - - // Allow the OS to mmap file for writing. Default: false - bool allow_mmap_writes; - - std::vector db_paths; - - MemTableRepFactory* memtable_factory; - - TableFactory* table_factory; + std::shared_ptr table_factory; Options::TablePropertiesCollectorFactories table_properties_collector_factories; - bool advise_random_on_open; - // This options is required by PlainTableReader. 
May need to move it // to PlainTableOptions just like bloom_bits_per_key uint32_t bloom_locality; bool purge_redundant_kvs_while_flush; - bool use_fsync; - std::vector compression_per_level; bool level_compaction_dynamic_level_bytes; - Options::AccessHint access_hint_on_compaction_start; - - bool new_table_reader_for_compaction_inputs; - int num_levels; bool optimize_filters_for_hits; bool force_consistency_checks; - bool allow_ingest_behind; - - bool preserve_deletes; + std::shared_ptr + memtable_insert_with_hint_prefix_extractor; - // A vector of EventListeners which callback functions will be called - // when specific RocksDB event happens. - std::vector> listeners; + std::vector cf_paths; - std::shared_ptr row_cache; + std::shared_ptr compaction_thread_limiter; - const SliceTransform* memtable_insert_with_hint_prefix_extractor; + std::shared_ptr sst_partitioner_factory; +}; - std::vector cf_paths; +struct ImmutableOptions : public ImmutableDBOptions, public ImmutableCFOptions { + explicit ImmutableOptions(); + explicit ImmutableOptions(const Options& options); - std::shared_ptr compaction_thread_limiter; + ImmutableOptions(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options); - FileChecksumGenFactory* file_checksum_gen_factory; + ImmutableOptions(const ImmutableDBOptions& db_options, + const ImmutableCFOptions& cf_options); - std::shared_ptr sst_partitioner_factory; + ImmutableOptions(const DBOptions& db_options, + const ImmutableCFOptions& cf_options); - bool allow_data_in_errors; + ImmutableOptions(const ImmutableDBOptions& db_options, + const ColumnFamilyOptions& cf_options); }; struct MutableCFOptions { @@ -163,6 +138,9 @@ struct MutableCFOptions { min_blob_size(options.min_blob_size), blob_file_size(options.blob_file_size), blob_compression_type(options.blob_compression_type), + enable_blob_garbage_collection(options.enable_blob_garbage_collection), + blob_garbage_collection_age_cutoff( + options.blob_garbage_collection_age_cutoff), max_sequential_skip_in_iterations( options.max_sequential_skip_in_iterations), check_flush_compaction_key_order( @@ -173,6 +151,7 @@ struct MutableCFOptions { bottommost_compression(options.bottommost_compression), compression_opts(options.compression_opts), bottommost_compression_opts(options.bottommost_compression_opts), + bottommost_temperature(options.bottommost_temperature), sample_for_compression( options.sample_for_compression) { // TODO: is 0 fine here? RefreshDerivedOptions(options.num_levels, options.compaction_style); @@ -206,12 +185,15 @@ struct MutableCFOptions { min_blob_size(0), blob_file_size(0), blob_compression_type(kNoCompression), + enable_blob_garbage_collection(false), + blob_garbage_collection_age_cutoff(0.0), max_sequential_skip_in_iterations(0), check_flush_compaction_key_order(true), paranoid_file_checks(false), report_bg_io_stats(false), compression(Snappy_Supported() ? 
kSnappyCompression : kNoCompression), bottommost_compression(kDisableCompressionOption), + bottommost_temperature(Temperature::kUnknown), sample_for_compression(0) {} explicit MutableCFOptions(const Options& options); @@ -267,6 +249,8 @@ struct MutableCFOptions { uint64_t min_blob_size; uint64_t blob_file_size; CompressionType blob_compression_type; + bool enable_blob_garbage_collection; + double blob_garbage_collection_age_cutoff; // Misc options uint64_t max_sequential_skip_in_iterations; @@ -277,6 +261,9 @@ struct MutableCFOptions { CompressionType bottommost_compression; CompressionOptions compression_opts; CompressionOptions bottommost_compression_opts; + // TODO this experimental option isn't made configurable + // through strings yet. + Temperature bottommost_temperature; uint64_t sample_for_compression; @@ -296,4 +283,15 @@ uint64_t MaxFileSizeForLevel(const MutableCFOptions& cf_options, // `pin_l0_filter_and_index_blocks_in_cache` is set. size_t MaxFileSizeForL0MetaPin(const MutableCFOptions& cf_options); +#ifndef ROCKSDB_LITE +Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, + const MutableCFOptions& mutable_opts, + std::string* opt_string); + +Status GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + Logger* info_log, MutableCFOptions* new_options); +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff --git a/options/configurable.cc b/options/configurable.cc index 8c11b0b0edf..f425f193ce8 100644 --- a/options/configurable.cc +++ b/options/configurable.cc @@ -8,6 +8,7 @@ #include "logging/logging.h" #include "options/configurable_helper.h" #include "options/options_helper.h" +#include "rocksdb/customizable.h" #include "rocksdb/status.h" #include "rocksdb/utilities/object_registry.h" #include "rocksdb/utilities/options_type.h" @@ -16,10 +17,10 @@ namespace ROCKSDB_NAMESPACE { -void ConfigurableHelper::RegisterOptions( - Configurable& configurable, const std::string& name, void* opt_ptr, +void Configurable::RegisterOptions( + const std::string& name, void* opt_ptr, const std::unordered_map* type_map) { - Configurable::RegisteredOptions opts; + RegisteredOptions opts; opts.name = name; #ifndef ROCKSDB_LITE opts.type_map = type_map; @@ -27,7 +28,7 @@ void ConfigurableHelper::RegisterOptions( (void)type_map; #endif // ROCKSDB_LITE opts.opt_ptr = opt_ptr; - configurable.options_.emplace_back(opts); + options_.emplace_back(opts); } //************************************************************************* @@ -57,13 +58,9 @@ Status Configurable::PrepareOptions(const ConfigOptions& opts) { } } } +#else + (void)opts; #endif // ROCKSDB_LITE - if (status.ok()) { - auto inner = Inner(); - if (inner != nullptr) { - status = inner->PrepareOptions(opts); - } - } if (status.ok()) { prepared_ = true; } @@ -94,13 +91,10 @@ Status Configurable::ValidateOptions(const DBOptions& db_opts, } } } +#else + (void)db_opts; + (void)cf_opts; #endif // ROCKSDB_LITE - if (status.ok()) { - const auto inner = Inner(); - if (inner != nullptr) { - status = inner->ValidateOptions(db_opts, cf_opts); - } - } return status; } @@ -116,12 +110,7 @@ const void* Configurable::GetOptionsPtr(const std::string& name) const { return o.opt_ptr; } } - auto inner = Inner(); - if (inner != nullptr) { - return inner->GetOptionsPtr(name); - } else { - return nullptr; - } + return nullptr; } std::string Configurable::GetOptionName(const std::string& opt_name) const { @@ -172,7 +161,10 @@ Status Configurable::ConfigureOptions( #ifndef 
ROCKSDB_LITE if (!config_options.ignore_unknown_options) { // If we are not ignoring unused, get the defaults in case we need to reset - GetOptionString(config_options, &curr_opts).PermitUncheckedError(); + ConfigOptions copy = config_options; + copy.depth = ConfigOptions::kDepthDetailed; + copy.delimiter = "; "; + GetOptionString(copy, &curr_opts).PermitUncheckedError(); } #endif // ROCKSDB_LITE Status s = ConfigurableHelper::ConfigureOptions(config_options, *this, @@ -234,9 +226,8 @@ Status Configurable::ConfigureFromString(const ConfigOptions& config_options, Status Configurable::ConfigureOption(const ConfigOptions& config_options, const std::string& name, const std::string& value) { - const std::string& opt_name = GetOptionName(name); - return ConfigurableHelper::ConfigureSingleOption(config_options, *this, - opt_name, value); + return ConfigurableHelper::ConfigureSingleOption(config_options, *this, name, + value); } /** @@ -250,9 +241,16 @@ Status Configurable::ParseOption(const ConfigOptions& config_options, const OptionTypeInfo& opt_info, const std::string& opt_name, const std::string& opt_value, void* opt_ptr) { - if (opt_info.IsMutable() || opt_info.IsConfigurable()) { - return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr); - } else if (prepared_) { + if (opt_info.IsMutable()) { + if (config_options.mutable_options_only) { + // This option is mutable. Treat all of its children as mutable as well + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + return opt_info.Parse(copy, opt_name, opt_value, opt_ptr); + } else { + return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr); + } + } else if (config_options.mutable_options_only) { return Status::InvalidArgument("Option not changeable: " + opt_name); } else { return opt_info.Parse(config_options, opt_name, opt_value, opt_ptr); @@ -375,15 +373,91 @@ Status ConfigurableHelper::ConfigureSomeOptions( Status ConfigurableHelper::ConfigureSingleOption( const ConfigOptions& config_options, Configurable& configurable, const std::string& name, const std::string& value) { - std::string opt_name; + const std::string& opt_name = configurable.GetOptionName(name); + std::string elem_name; void* opt_ptr = nullptr; const auto opt_info = - FindOption(configurable.options_, name, &opt_name, &opt_ptr); + FindOption(configurable.options_, opt_name, &elem_name, &opt_ptr); if (opt_info == nullptr) { return Status::NotFound("Could not find option: ", name); } else { - return ConfigureOption(config_options, configurable, *opt_info, name, - opt_name, value, opt_ptr); + return ConfigureOption(config_options, configurable, *opt_info, opt_name, + elem_name, value, opt_ptr); + } +} +Status ConfigurableHelper::ConfigureCustomizableOption( + const ConfigOptions& config_options, Configurable& configurable, + const OptionTypeInfo& opt_info, const std::string& opt_name, + const std::string& name, const std::string& value, void* opt_ptr) { + Customizable* custom = opt_info.AsRawPointer(opt_ptr); + ConfigOptions copy = config_options; + if (opt_info.IsMutable()) { + // This option is mutable. 
Pass that property on to any subsequent calls + copy.mutable_options_only = false; + } + + if (opt_info.IsMutable() || !config_options.mutable_options_only) { + // Either the option is mutable, or we are processing all of the options + if (opt_name == name || + EndsWith(opt_name, ConfigurableHelper::kIdPropSuffix) || + name == ConfigurableHelper::kIdPropName) { + return configurable.ParseOption(copy, opt_info, opt_name, value, opt_ptr); + } else if (value.empty()) { + return Status::OK(); + } else if (custom == nullptr || !StartsWith(name, custom->GetId() + ".")) { + return configurable.ParseOption(copy, opt_info, name, value, opt_ptr); + } else if (value.find("=") != std::string::npos) { + return custom->ConfigureFromString(copy, value); + } else { + return custom->ConfigureOption(copy, name, value); + } + } else { + // We are processing immutable options, which means that we cannot change + // the Customizable object itself, but could change its mutable properties. + // Check to make sure that nothing is trying to change the Customizable + if (custom == nullptr) { + // We do not have a Customizable to configure. This is OK if the + // value is empty (nothing being configured) but an error otherwise + if (value.empty()) { + return Status::OK(); + } else { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } + } else if (EndsWith(opt_name, ConfigurableHelper::kIdPropSuffix) || + name == ConfigurableHelper::kIdPropName) { + // We have a property of the form "id=value" or "table.id=value" + // This is OK if we ID/value matches the current customizable object + if (custom->GetId() == value) { + return Status::OK(); + } else { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } + } else if (opt_name == name) { + // The properties are of one of forms: + // name = { id = id; prop1 = value1; ... } + // name = { prop1=value1; prop2=value2; ... } + // name = ID + // Convert the value to a map and extract the ID + // If the ID does not match that of the current customizable, return an + // error. 
Otherwise, update the current customizable via the properties + // map + std::unordered_map props; + std::string id; + Status s = GetOptionsMap(value, custom->GetId(), &id, &props); + if (!s.ok()) { + return s; + } else if (custom->GetId() != id) { + return Status::InvalidArgument("Option not changeable: " + opt_name); + } else if (props.empty()) { + return Status::OK(); + } else { + return custom->ConfigureFromMap(copy, props); + } + } else { + // Attempting to configure one of the properties of the customizable + // Let it through + return custom->ConfigureOption(copy, name, value); + } } } @@ -391,7 +465,10 @@ Status ConfigurableHelper::ConfigureOption( const ConfigOptions& config_options, Configurable& configurable, const OptionTypeInfo& opt_info, const std::string& opt_name, const std::string& name, const std::string& value, void* opt_ptr) { - if (opt_name == name) { + if (opt_info.IsCustomizable()) { + return ConfigureCustomizableOption(config_options, configurable, opt_info, + opt_name, name, value, opt_ptr); + } else if (opt_name == name) { return configurable.ParseOption(config_options, opt_info, opt_name, value, opt_ptr); } else if (opt_info.IsStruct() || opt_info.IsConfigurable()) { @@ -403,6 +480,32 @@ Status ConfigurableHelper::ConfigureOption( } #endif // ROCKSDB_LITE +Status ConfigurableHelper::ConfigureNewObject( + const ConfigOptions& config_options_in, Configurable* object, + const std::string& id, const std::string& base_opts, + const std::unordered_map& opts) { + if (object != nullptr) { + ConfigOptions config_options = config_options_in; + config_options.invoke_prepare_options = false; + if (!base_opts.empty()) { +#ifndef ROCKSDB_LITE + // Don't run prepare options on the base, as we would do that on the + // overlay opts instead + Status status = object->ConfigureFromString(config_options, base_opts); + if (!status.ok()) { + return status; + } +#endif // ROCKSDB_LITE + } + if (!opts.empty()) { + return object->ConfigureFromMap(config_options, opts); + } + } else if (!opts.empty()) { // No object but no map. 
This is OK + return Status::InvalidArgument("Cannot configure null object ", id); + } + return Status::OK(); +} + //******************************************************************************* // // Methods for Converting Options into strings @@ -489,8 +592,25 @@ Status ConfigurableHelper::SerializeOptions(const ConfigOptions& config_options, const auto& opt_info = map_iter.second; if (opt_info.ShouldSerialize()) { std::string value; - Status s = opt_info.Serialize(config_options, prefix + opt_name, - opt_iter.opt_ptr, &value); + Status s; + if (!config_options.mutable_options_only) { + s = opt_info.Serialize(config_options, prefix + opt_name, + opt_iter.opt_ptr, &value); + } else if (opt_info.IsMutable()) { + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + s = opt_info.Serialize(copy, prefix + opt_name, opt_iter.opt_ptr, + &value); + } else if (opt_info.IsConfigurable()) { + // If it is a Configurable and we are either printing all of the + // details or not printing only the name, this option should be + // included in the list + if (config_options.IsDetailed() || + !opt_info.IsEnabled(OptionTypeFlags::kStringNameOnly)) { + s = opt_info.Serialize(config_options, prefix + opt_name, + opt_iter.opt_ptr, &value); + } + } if (!s.ok()) { return s; } else if (!value.empty()) { @@ -519,7 +639,7 @@ Status Configurable::GetOptionNames( } Status ConfigurableHelper::ListOptions( - const ConfigOptions& /*config_options*/, const Configurable& configurable, + const ConfigOptions& config_options, const Configurable& configurable, const std::string& prefix, std::unordered_set* result) { Status status; for (auto const& opt_iter : configurable.options_) { @@ -529,7 +649,11 @@ Status ConfigurableHelper::ListOptions( // If the option is no longer used in rocksdb and marked as deprecated, // we skip it in the serialization. 
if (!opt_info.IsDeprecated() && !opt_info.IsAlias()) { - result->emplace(prefix + opt_name); + if (!config_options.mutable_options_only) { + result->emplace(prefix + opt_name); + } else if (opt_info.IsMutable()) { + result->emplace(prefix + opt_name); + } } } } @@ -594,11 +718,23 @@ bool ConfigurableHelper::AreEquivalent(const ConfigOptions& config_options, return false; } else { for (const auto& map_iter : *(o.type_map)) { - if (config_options.IsCheckEnabled(map_iter.second.GetSanityLevel()) && - !this_one.OptionsAreEqual(config_options, map_iter.second, - map_iter.first, this_offset, - that_offset, mismatch)) { - return false; + const auto& opt_info = map_iter.second; + if (config_options.IsCheckEnabled(opt_info.GetSanityLevel())) { + if (!config_options.mutable_options_only) { + if (!this_one.OptionsAreEqual(config_options, opt_info, + map_iter.first, this_offset, + that_offset, mismatch)) { + return false; + } + } else if (opt_info.IsMutable()) { + ConfigOptions copy = config_options; + copy.mutable_options_only = false; + if (!this_one.OptionsAreEqual(copy, opt_info, map_iter.first, + this_offset, that_offset, + mismatch)) { + return false; + } + } } } } @@ -607,4 +743,47 @@ bool ConfigurableHelper::AreEquivalent(const ConfigOptions& config_options, return true; } #endif // ROCKSDB_LITE + +Status ConfigurableHelper::GetOptionsMap( + const std::string& value, const Customizable* customizable, std::string* id, + std::unordered_map* props) { + if (customizable != nullptr) { + return GetOptionsMap(value, customizable->GetId(), id, props); + } else { + return GetOptionsMap(value, "", id, props); + } +} + +Status ConfigurableHelper::GetOptionsMap( + const std::string& value, const std::string& default_id, std::string* id, + std::unordered_map* props) { + assert(id); + assert(props); + Status status; + if (value.empty() || value == kNullptrString) { + *id = default_id; + } else if (value.find('=') == std::string::npos) { + *id = value; +#ifndef ROCKSDB_LITE + } else { + status = StringToMap(value, props); + if (status.ok()) { + auto iter = props->find(ConfigurableHelper::kIdPropName); + if (iter != props->end()) { + *id = iter->second; + props->erase(iter); + } else if (default_id.empty()) { // Should this be an error?? + status = Status::InvalidArgument("Name property is missing"); + } else { + *id = default_id; + } + } +#else + } else { + *id = value; + props->clear(); +#endif + } + return status; +} } // namespace ROCKSDB_NAMESPACE diff --git a/options/configurable_helper.h b/options/configurable_helper.h index 6a2454727e2..b822b0b8eec 100644 --- a/options/configurable_helper.h +++ b/options/configurable_helper.h @@ -20,35 +20,8 @@ namespace ROCKSDB_NAMESPACE { // of configuring the objects. class ConfigurableHelper { public: - // Registers the input name with the options and associated map. - // When classes register their options in this manner, most of the - // functionality (excluding unknown options and validate/prepare) is - // implemented by the base class. - // - // This method should be called in the class constructor to register the - // option set for this object. For example, to register the options - // associated with the BlockBasedTableFactory, the constructor calls this - // method passing in: - // - the name of the options ("BlockBasedTableOptions"); - // - the options object (the BlockBasedTableOptions object for this object; - // - the options type map for the BlockBasedTableOptions. 
- // This registration allows the Configurable class to process the option - // values associated with the BlockBasedTableOptions without further code in - // the derived class. - // - // @param name The name of this set of options (@see GetOptionsPtr) - // @param opt_ptr Pointer to the options to associate with this name - // @param opt_map Options map that controls how this option is configured. - template - static void RegisterOptions( - Configurable& configurable, T* opt_ptr, - const std::unordered_map* opt_map) { - RegisterOptions(configurable, T::kName(), opt_ptr, opt_map); - } - static void RegisterOptions( - Configurable& configurable, const std::string& name, void* opt_ptr, - const std::unordered_map* opt_map); - + constexpr static const char* kIdPropName = "id"; + constexpr static const char* kIdPropSuffix = ".id"; // Configures the input Configurable object based on the parameters. // On successful completion, the Configurable is updated with the settings // from the opt_map. @@ -75,6 +48,43 @@ class ConfigurableHelper { const std::unordered_map& options, std::unordered_map* unused); + // Helper method for configuring a new customizable object. + // If base_opts are set, this is the "default" options to use for the new + // object. Then any values in "new_opts" are applied to the object. + // Returns OK if the object could be successfully configured + // @return NotFound If any of the names in the base or new opts were not valid + // for this object. + // @return NotSupported If any of the names are valid but the object does + // not know how to convert the value. This can happen if, for example, + // there is some nested Configurable that cannot be created. + // @return InvalidArgument If any of the values cannot be successfully + // parsed. + static Status ConfigureNewObject( + const ConfigOptions& config_options, Configurable* object, + const std::string& id, const std::string& base_opts, + const std::unordered_map& new_opts); + + // Splits the input opt_value into the ID field and the remaining options. + // The input opt_value can be in the form of "name" or "name=value + // [;name=value]". The first form uses the "name" as an id with no options The + // latter form converts the input into a map of name=value pairs and sets "id" + // to the "id" value from the map. + // @param opt_value The value to split into id and options + // @param id The id field from the opt_value + // @param options The remaining name/value pairs from the opt_value + // @param default_id If specified and there is no id field in the map, this + // value is returned as the ID + // @return OK if the value was converted to a map succesfully and an ID was + // found. + // @return InvalidArgument if the value could not be converted to a map or + // there was or there is no id property in the map. + static Status GetOptionsMap( + const std::string& opt_value, const Customizable* custom, std::string* id, + std::unordered_map* options); + static Status GetOptionsMap( + const std::string& opt_value, const std::string& default_id, + std::string* id, std::unordered_map* options); + #ifndef ROCKSDB_LITE // Internal method to configure a set of options for this object. // Classes may override this value to change its behavior. 
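As an illustrative sketch (not part of the patch; the wrapper function name is hypothetical), the two input forms described above would split roughly as follows, assuming the GetOptionsMap overloads declared in this header:

void GetOptionsMapSketch() {
  std::string id;
  std::unordered_map<std::string, std::string> props;
  // A bare name: the whole value becomes the id and no properties are returned.
  Status s = ConfigurableHelper::GetOptionsMap("BlockBasedTable", "", &id, &props);
  // id == "BlockBasedTable", props is empty
  // name=value pairs: the "id" entry is extracted; the remaining pairs stay in props.
  s = ConfigurableHelper::GetOptionsMap("id=A;int=1;bool=true", "", &id, &props);
  // id == "A", props == {{"int", "1"}, {"bool", "true"}}
}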
@@ -205,6 +215,11 @@ class ConfigurableHelper { static const OptionTypeInfo* FindOption( const std::vector& options, const std::string& name, std::string* opt_name, void** opt_ptr); + + static Status ConfigureCustomizableOption( + const ConfigOptions& config_options, Configurable& configurable, + const OptionTypeInfo& opt_info, const std::string& opt_name, + const std::string& name, const std::string& value, void* opt_ptr); #endif // ROCKSDB_LITE }; diff --git a/options/configurable_test.cc b/options/configurable_test.cc index 27f8775269b..5983e2dc614 100644 --- a/options/configurable_test.cc +++ b/options/configurable_test.cc @@ -45,6 +45,22 @@ class StringLogger : public Logger { private: std::string string_; }; +static std::unordered_map struct_option_info = { +#ifndef ROCKSDB_LITE + {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kMutable)}, +#endif // ROCKSDB_LITE +}; + +static std::unordered_map imm_struct_option_info = + { +#ifndef ROCKSDB_LITE + {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; class SimpleConfigurable : public TestConfigurable { public: @@ -62,46 +78,20 @@ class SimpleConfigurable : public TestConfigurable { : TestConfigurable(name, mode, map) { if ((mode & TestConfigMode::kUniqueMode) != 0) { unique_.reset(SimpleConfigurable::Create("Unique" + name_)); - ConfigurableHelper::RegisterOptions(*this, name_ + "Unique", &unique_, - &unique_option_info); + RegisterOptions(name_ + "Unique", &unique_, &unique_option_info); } if ((mode & TestConfigMode::kSharedMode) != 0) { shared_.reset(SimpleConfigurable::Create("Shared" + name_)); - ConfigurableHelper::RegisterOptions(*this, name_ + "Shared", &shared_, - &shared_option_info); + RegisterOptions(name_ + "Shared", &shared_, &shared_option_info); } if ((mode & TestConfigMode::kRawPtrMode) != 0) { pointer_ = SimpleConfigurable::Create("Pointer" + name_); - ConfigurableHelper::RegisterOptions(*this, name_ + "Pointer", &pointer_, - &pointer_option_info); + RegisterOptions(name_ + "Pointer", &pointer_, &pointer_option_info); } } }; // End class SimpleConfigurable -static std::unordered_map wrapped_option_info = { -#ifndef ROCKSDB_LITE - {"inner", - {0, OptionType::kConfigurable, OptionVerificationType::kNormal, - OptionTypeFlags::kShared}}, -#endif // ROCKSDB_LITE -}; -class WrappedConfigurable : public SimpleConfigurable { - public: - WrappedConfigurable(const std::string& name, unsigned char mode, - const std::shared_ptr& t) - : SimpleConfigurable(name, mode, &simple_option_info), inner_(t) { - ConfigurableHelper::RegisterOptions(*this, "WrappedOptions", &inner_, - &wrapped_option_info); - } - - protected: - Configurable* Inner() const override { return inner_.get(); } - - private: - std::shared_ptr inner_; -}; - using ConfigTestFactoryFunc = std::function; class ConfigurableTest : public testing::Test { @@ -257,19 +247,15 @@ class ValidatedConfigurable : public SimpleConfigurable { : SimpleConfigurable(name, TestConfigMode::kDefaultMode), validated(false), prepared(0) { - ConfigurableHelper::RegisterOptions(*this, "Validated", &validated, - &validated_option_info); - ConfigurableHelper::RegisterOptions(*this, "Prepared", &prepared, - &prepared_option_info); + RegisterOptions("Validated", &validated, &validated_option_info); + RegisterOptions("Prepared", &prepared, &prepared_option_info); if ((mode & TestConfigMode::kUniqueMode) != 0) { 
unique_.reset(new ValidatedConfigurable( "Unique" + name_, TestConfigMode::kDefaultMode, false)); if (dont_prepare) { - ConfigurableHelper::RegisterOptions(*this, name_ + "Unique", &unique_, - &dont_prepare_option_info); + RegisterOptions(name_ + "Unique", &unique_, &dont_prepare_option_info); } else { - ConfigurableHelper::RegisterOptions(*this, name_ + "Unique", &unique_, - &unique_option_info); + RegisterOptions(name_ + "Unique", &unique_, &unique_option_info); } } } @@ -345,6 +331,69 @@ TEST_F(ConfigurableTest, PrepareOptionsTest) { ASSERT_EQ(*up, 0); } +TEST_F(ConfigurableTest, MutableOptionsTest) { + static std::unordered_map imm_option_info = { +#ifndef ROCKSDB_LITE + {"imm", OptionTypeInfo::Struct("imm", &simple_option_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE + }; + + class MutableConfigurable : public SimpleConfigurable { + public: + MutableConfigurable() + : SimpleConfigurable("mutable", TestConfigMode::kDefaultMode | + TestConfigMode::kUniqueMode | + TestConfigMode::kSharedMode) { + RegisterOptions("struct", &options_, &struct_option_info); + RegisterOptions("imm", &options_, &imm_option_info); + } + }; + MutableConfigurable mc; + ConfigOptions options = config_options_; + + ASSERT_OK(mc.ConfigureOption(options, "bool", "true")); + ASSERT_OK(mc.ConfigureOption(options, "int", "42")); + auto* opts = mc.GetOptions("mutable"); + ASSERT_NE(opts, nullptr); + ASSERT_EQ(opts->i, 42); + ASSERT_EQ(opts->b, true); + ASSERT_OK(mc.ConfigureOption(options, "struct", "{bool=false;}")); + ASSERT_OK(mc.ConfigureOption(options, "imm", "{int=55;}")); + + options.mutable_options_only = true; + + // Now only mutable options should be settable. + ASSERT_NOK(mc.ConfigureOption(options, "bool", "true")); + ASSERT_OK(mc.ConfigureOption(options, "int", "24")); + ASSERT_EQ(opts->i, 24); + ASSERT_EQ(opts->b, false); + ASSERT_NOK(mc.ConfigureFromString(options, "bool=false;int=33;")); + ASSERT_EQ(opts->i, 24); + ASSERT_EQ(opts->b, false); + + // Setting options through an immutable struct fails + ASSERT_NOK(mc.ConfigureOption(options, "imm", "{int=55;}")); + ASSERT_NOK(mc.ConfigureOption(options, "imm.int", "55")); + ASSERT_EQ(opts->i, 24); + ASSERT_EQ(opts->b, false); + + // Setting options through an mutable struct succeeds + ASSERT_OK(mc.ConfigureOption(options, "struct", "{int=44;}")); + ASSERT_EQ(opts->i, 44); + ASSERT_OK(mc.ConfigureOption(options, "struct.int", "55")); + ASSERT_EQ(opts->i, 55); + + // Setting nested immutable configurable options fail + ASSERT_NOK(mc.ConfigureOption(options, "shared", "{bool=true;}")); + ASSERT_NOK(mc.ConfigureOption(options, "shared.bool", "true")); + + // Setting nested mutable configurable options succeeds + ASSERT_OK(mc.ConfigureOption(options, "unique", "{bool=true}")); + ASSERT_OK(mc.ConfigureOption(options, "unique.bool", "true")); +} + TEST_F(ConfigurableTest, DeprecatedOptionsTest) { static std::unordered_map deprecated_option_info = { @@ -476,13 +525,6 @@ TEST_F(ConfigurableTest, MatchesTest) { } static Configurable* SimpleStructFactory() { - static std::unordered_map struct_option_info = { -#ifndef ROCKSDB_LITE - {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0, - OptionVerificationType::kNormal, - OptionTypeFlags::kMutable)}, -#endif // ROCKSDB_LITE - }; return SimpleConfigurable::Create( "simple-struct", TestConfigMode::kDefaultMode, &struct_option_info); } @@ -607,17 +649,6 @@ static std::unordered_map TestFactories = { TestConfigMode::kSimpleMode | TestConfigMode::kNestedMode); 
}}, - {"ThreeWay", - []() { - std::shared_ptr child; - child.reset( - SimpleConfigurable::Create("child", TestConfigMode::kDefaultMode)); - std::shared_ptr parent; - parent.reset(new WrappedConfigurable( - "parent", TestConfigMode::kDefaultMode, child)); - return new WrappedConfigurable("master", TestConfigMode::kDefaultMode, - parent); - }}, {"ThreeDeep", []() { Configurable* simple = SimpleConfigurable::Create( @@ -765,10 +796,6 @@ INSTANTIATE_TEST_CASE_P( "pointer={int=22;string=pointer};" "unique={int=33;string=unique};" "shared={int=44;string=shared}"), - std::pair("ThreeWay", - "int=11;bool=true;string=outer;" - "inner={int=22;string=parent;" - "inner={int=33;string=child}};"), std::pair("ThreeDeep", "int=11;bool=true;string=outer;" "unique={int=22;string=inner;" diff --git a/options/configurable_test.h b/options/configurable_test.h index 52c3599f667..cf9d0667837 100644 --- a/options/configurable_test.h +++ b/options/configurable_test.h @@ -112,11 +112,10 @@ class TestConfigurable : public Configurable { : name_(name), pointer_(nullptr) { prefix_ = "test." + name + "."; if ((mode & TestConfigMode::kSimpleMode) != 0) { - ConfigurableHelper::RegisterOptions(*this, name_, &options_, map); + RegisterOptions(name_, &options_, map); } if ((mode & TestConfigMode::kEnumMode) != 0) { - ConfigurableHelper::RegisterOptions(*this, name_ + "Enum", &options_, - &enum_option_info); + RegisterOptions(name_ + "Enum", &options_, &enum_option_info); } } diff --git a/options/customizable.cc b/options/customizable.cc new file mode 100644 index 00000000000..3488f326bac --- /dev/null +++ b/options/customizable.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/customizable.h" + +#include "options/configurable_helper.h" +#include "rocksdb/convenience.h" +#include "rocksdb/status.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +std::string Customizable::GetOptionName(const std::string& long_name) const { + const std::string& name = Name(); + size_t name_len = name.size(); + if (long_name.size() > name_len + 1 && + long_name.compare(0, name_len, name) == 0 && + long_name.at(name_len) == '.') { + return long_name.substr(name_len + 1); + } else { + return Configurable::GetOptionName(long_name); + } +} + +#ifndef ROCKSDB_LITE +Status Customizable::GetOption(const ConfigOptions& config_options, + const std::string& opt_name, + std::string* value) const { + if (opt_name == ConfigurableHelper::kIdPropName) { + *value = GetId(); + return Status::OK(); + } else { + return Configurable::GetOption(config_options, opt_name, value); + } +} + +std::string Customizable::SerializeOptions(const ConfigOptions& config_options, + const std::string& prefix) const { + std::string result; + std::string parent; + if (!config_options.IsShallow()) { + parent = Configurable::SerializeOptions(config_options, ""); + } + if (parent.empty()) { + result = GetId(); + } else { + result.append(prefix + ConfigurableHelper::kIdPropName + "=" + GetId() + + config_options.delimiter); + result.append(parent); + } + return result; +} + +#endif // ROCKSDB_LITE + +bool Customizable::AreEquivalent(const ConfigOptions& config_options, + const Configurable* other, + std::string* mismatch) const { + if (config_options.sanity_level > ConfigOptions::kSanityLevelNone && + this != other) { + const Customizable* custom = reinterpret_cast(other); + if (GetId() != custom->GetId()) { + *mismatch = ConfigurableHelper::kIdPropName; + return false; + } else if (config_options.sanity_level > + ConfigOptions::kSanityLevelLooselyCompatible) { + bool matches = + Configurable::AreEquivalent(config_options, other, mismatch); + return matches; + } + } + return true; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/options/customizable_helper.h b/options/customizable_helper.h new file mode 100644 index 00000000000..cd7cc26f830 --- /dev/null +++ b/options/customizable_helper.h @@ -0,0 +1,221 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include +#include +#include + +#include "options/configurable_helper.h" +#include "rocksdb/convenience.h" +#include "rocksdb/customizable.h" +#include "rocksdb/status.h" +#include "rocksdb/utilities/object_registry.h" + +namespace ROCKSDB_NAMESPACE { +template +using SharedFactoryFunc = + std::function*)>; + +template +using UniqueFactoryFunc = + std::function*)>; + +template +using StaticFactoryFunc = std::function; + +// Creates a new shared Customizable object based on the input parameters. +// This method parses the input value to determine the type of instance to +// create. If there is an existing instance (in result) and it is the same type +// as the object being created, the existing configuration is stored and used as +// the default for the new object. +// +// The value parameter specified the instance class of the object to create. +// If it is a simple string (e.g. BlockBasedTable), then the instance will be +// created using the default settings. 
If the value is a set of name-value +// pairs, then the "id" value is used to determine the instance to create and +// the remaining parameters are used to configure the object. If name-value +// pairs are specified, there must be an "id=value" pairing or an error will +// result. +// +// The config_options parameter controls the process and how errors are +// returned. If ignore_unknown_options=true, unknown values are ignored during +// the configuration. If ignore_unsupported_options=true, unknown instance types +// are ignored. If invoke_prepare_options=true, the resulting instance will be +// initialized (via PrepareOptions). +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to +// create and initialize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. +template <typename T> +static Status LoadSharedObject(const ConfigOptions& config_options, + const std::string& value, + const SharedFactoryFunc<T>& func, + std::shared_ptr<T>* result) { + std::string id; + std::unordered_map<std::string, std::string> opt_map; + Status status = + ConfigurableHelper::GetOptionsMap(value, result->get(), &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } + std::string curr_opts; +#ifndef ROCKSDB_LITE + if (result->get() != nullptr && result->get()->GetId() == id) { + // Try to get the existing options, ignoring any errors + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + result->get()->GetOptionString(embedded, &curr_opts).PermitUncheckedError(); + } +#endif + if (func == nullptr || !func(id, result)) { // No factory, or it failed + if (value.empty()) { + // No Id and no options. Clear the object + result->reset(); + return Status::OK(); + } else if (id.empty()) { // We have no Id but have options. Not good + return Status::NotSupported("Cannot reset object ", id); + } else { +#ifndef ROCKSDB_LITE + status = config_options.registry->NewSharedObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif + if (!status.ok()) { + if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + return Status::OK(); + } else { + return status; + } + } + } + } + return ConfigurableHelper::ConfigureNewObject(config_options, result->get(), + id, curr_opts, opt_map); +} + +// Creates a new unique customizable instance object based on the input +// parameters. +// @see LoadSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to +// create and initialize the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance.
+template +static Status LoadUniqueObject(const ConfigOptions& config_options, + const std::string& value, + const UniqueFactoryFunc& func, + std::unique_ptr* result) { + std::string id; + std::unordered_map opt_map; + Status status = + ConfigurableHelper::GetOptionsMap(value, result->get(), &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } + std::string curr_opts; +#ifndef ROCKSDB_LITE + if (result->get() != nullptr && result->get()->GetId() == id) { + // Try to get the existing options, ignoring any errors + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + result->get()->GetOptionString(embedded, &curr_opts).PermitUncheckedError(); + } +#endif + if (func == nullptr || !func(id, result)) { // No factory, or it failed + if (value.empty()) { + // No Id and no options. Clear the object + result->reset(); + return Status::OK(); + } else if (id.empty()) { // We have no Id but have options. Not good + return Status::NotSupported("Cannot reset object ", id); + } else { +#ifndef ROCKSDB_LITE + status = config_options.registry->NewUniqueObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (!status.ok()) { + if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + return Status::OK(); + } else { + return status; + } + } + } + } + return ConfigurableHelper::ConfigureNewObject(config_options, result->get(), + id, curr_opts, opt_map); +} +// Creates a new static (raw pointer) customizable instance object based on the +// input parameters. +// @see LoadSharedObject for more information on the inner workings of this +// method. +// +// @param config_options Controls how the instance is created and errors are +// handled +// @param value Either the simple name of the instance to create, or a set of +// name-value pairs to +// create and initailzie the object +// @param func Optional function to call to attempt to create an instance +// @param result The newly created instance. +template +static Status LoadStaticObject(const ConfigOptions& config_options, + const std::string& value, + const StaticFactoryFunc& func, T** result) { + std::string id; + std::unordered_map opt_map; + Status status = + ConfigurableHelper::GetOptionsMap(value, *result, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } + std::string curr_opts; +#ifndef ROCKSDB_LITE + if (*result != nullptr && (*result)->GetId() == id) { + // Try to get the existing options, ignoring any errors + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + (*result)->GetOptionString(embedded, &curr_opts).PermitUncheckedError(); + } +#endif + if (func == nullptr || !func(id, result)) { // No factory, or it failed + if (value.empty()) { + // No Id and no options. Clear the object + *result = nullptr; + return Status::OK(); + } else if (id.empty()) { // We have no Id but have options. 
Not good + return Status::NotSupported("Cannot reset object ", id); + } else { +#ifndef ROCKSDB_LITE + status = config_options.registry->NewStaticObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (!status.ok()) { + if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + return Status::OK(); + } else { + return status; + } + } + } + } + return ConfigurableHelper::ConfigureNewObject(config_options, *result, id, + curr_opts, opt_map); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/options/customizable_test.cc b/options/customizable_test.cc new file mode 100644 index 00000000000..d48ed104034 --- /dev/null +++ b/options/customizable_test.cc @@ -0,0 +1,831 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/customizable.h" + +#include +#include +#include +#include + +#include "options/configurable_helper.h" +#include "options/customizable_helper.h" +#include "options/options_helper.h" +#include "options/options_parser.h" +#include "rocksdb/convenience.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "table/mock_table.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +#ifndef GFLAGS +bool FLAGS_enable_print = false; +#else +#include "util/gflags_compat.h" +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +DEFINE_bool(enable_print, false, "Print options generated to console."); +#endif // GFLAGS + +namespace ROCKSDB_NAMESPACE { +class StringLogger : public Logger { + public: + using Logger::Logv; + void Logv(const char* format, va_list ap) override { + char buffer[1000]; + vsnprintf(buffer, sizeof(buffer), format, ap); + string_.append(buffer); + } + const std::string& str() const { return string_; } + void clear() { string_.clear(); } + + private: + std::string string_; +}; + +class TestCustomizable : public Customizable { + public: + TestCustomizable(const std::string& name) : name_(name) {} + // Method to allow CheckedCast to work for this class + static const char* kClassName() { + return "TestCustomizable"; + ; + } + + const char* Name() const override { return name_.c_str(); } + static const char* Type() { return "test.custom"; } + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + std::unique_ptr* result); + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + std::shared_ptr* result); + static Status CreateFromString(const ConfigOptions& opts, + const std::string& value, + TestCustomizable** result); + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return Customizable::IsInstanceOf(name); + } + } + + protected: + const std::string name_; +}; + +struct AOptions { + int i = 0; + bool b = false; +}; + +static std::unordered_map a_option_info = { +#ifndef ROCKSDB_LITE + {"int", + {offsetof(struct AOptions, i), OptionType::kInt, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"bool", + {offsetof(struct 
AOptions, b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; +class ACustomizable : public TestCustomizable { + public: + explicit ACustomizable(const std::string& id) + : TestCustomizable("A"), id_(id) { + RegisterOptions("A", &opts_, &a_option_info); + } + std::string GetId() const override { return id_; } + static const char* kClassName() { return "A"; } + + private: + AOptions opts_; + const std::string id_; +}; + +#ifndef ROCKSDB_LITE +static int A_count = 0; +const FactoryFunc& a_func = + ObjectLibrary::Default()->Register( + "A.*", + [](const std::string& name, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new ACustomizable(name)); + A_count++; + return guard->get(); + }); +#endif // ROCKSDB_LITE + +struct BOptions { + std::string s; + bool b = false; +}; + +static std::unordered_map b_option_info = { +#ifndef ROCKSDB_LITE + {"string", + {offsetof(struct BOptions, s), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"bool", + {offsetof(struct BOptions, b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, +#endif // ROCKSDB_LITE +}; + +class BCustomizable : public TestCustomizable { + private: + public: + explicit BCustomizable(const std::string& name) : TestCustomizable(name) { + RegisterOptions(name, &opts_, &b_option_info); + } + static const char* kClassName() { return "B"; } + + private: + BOptions opts_; +}; + +static bool LoadSharedB(const std::string& id, + std::shared_ptr* result) { + if (id == "B") { + result->reset(new BCustomizable(id)); + return true; + } else if (id.empty()) { + result->reset(); + return true; + } else { + return false; + } +} +Status TestCustomizable::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* result) { + return LoadSharedObject(config_options, value, LoadSharedB, + result); +} + +Status TestCustomizable::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::unique_ptr* result) { + return LoadUniqueObject( + config_options, value, + [](const std::string& id, std::unique_ptr* u) { + if (id == "B") { + u->reset(new BCustomizable(id)); + return true; + } else if (id.empty()) { + u->reset(); + return true; + } else { + return false; + } + }, + result); +} + +Status TestCustomizable::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + TestCustomizable** result) { + return LoadStaticObject( + config_options, value, + [](const std::string& id, TestCustomizable** ptr) { + if (id == "B") { + *ptr = new BCustomizable(id); + return true; + } else if (id.empty()) { + *ptr = nullptr; + return true; + } else { + return false; + } + }, + result); +} + +#ifndef ROCKSDB_LITE +const FactoryFunc& s_func = + ObjectLibrary::Default()->Register( + "S", [](const std::string& name, + std::unique_ptr* /* guard */, + std::string* /* msg */) { return new BCustomizable(name); }); +#endif // ROCKSDB_LITE + +struct SimpleOptions { + bool b = true; + bool is_mutable = true; + std::unique_ptr cu; + std::shared_ptr cs; + TestCustomizable* cp = nullptr; +}; + +static std::unordered_map simple_option_info = { +#ifndef ROCKSDB_LITE + {"bool", + {offsetof(struct SimpleOptions, b), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"unique", OptionTypeInfo::AsCustomUniquePtr( + offsetof(struct SimpleOptions, cu), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + 
{"shared", OptionTypeInfo::AsCustomSharedPtr( + offsetof(struct SimpleOptions, cs), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"pointer", OptionTypeInfo::AsCustomRawPtr( + offsetof(struct SimpleOptions, cp), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, +#endif // ROCKSDB_LITE +}; + +class SimpleConfigurable : public Configurable { + private: + SimpleOptions simple_; + + public: + SimpleConfigurable() { + RegisterOptions("simple", &simple_, &simple_option_info); + } + + explicit SimpleConfigurable( + const std::unordered_map* map) { + RegisterOptions("simple", &simple_, map); + } + + bool IsPrepared() const override { + if (simple_.is_mutable) { + return false; + } else { + return Configurable::IsPrepared(); + } + } + + private: +}; + +class CustomizableTest : public testing::Test { + public: + ConfigOptions config_options_; +}; + +#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE +// Tests that a Customizable can be created by: +// - a simple name +// - a XXX.id option +// - a property with a name +TEST_F(CustomizableTest, CreateByNameTest) { + ObjectLibrary::Default()->Register( + "TEST.*", + [](const std::string& name, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new TestCustomizable(name)); + return guard->get(); + }); + std::unique_ptr configurable(new SimpleConfigurable()); + SimpleOptions* simple = configurable->GetOptions("simple"); + ASSERT_NE(simple, nullptr); + ASSERT_OK( + configurable->ConfigureFromString(config_options_, "unique={id=TEST_1}")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "TEST_1"); + ASSERT_OK( + configurable->ConfigureFromString(config_options_, "unique.id=TEST_2")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "TEST_2"); + ASSERT_OK( + configurable->ConfigureFromString(config_options_, "unique=TEST_3")); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "TEST_3"); +} + +TEST_F(CustomizableTest, ToStringTest) { + std::unique_ptr custom(new TestCustomizable("test")); + ASSERT_EQ(custom->ToString(config_options_), "test"); +} + +TEST_F(CustomizableTest, SimpleConfigureTest) { + std::unordered_map opt_map = { + {"unique", "id=A;int=1;bool=true"}, + {"shared", "id=B;string=s"}, + }; + std::unique_ptr configurable(new SimpleConfigurable()); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map)); + SimpleOptions* simple = configurable->GetOptions("simple"); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "A"); + std::string opt_str; + std::string mismatch; + ASSERT_OK(configurable->GetOptionString(config_options_, &opt_str)); + std::unique_ptr copy(new SimpleConfigurable()); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE( + configurable->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +static void GetMapFromProperties( + const std::string& props, + std::unordered_map* map) { + std::istringstream iss(props); + std::unordered_map copy_map; + std::string line; + map->clear(); + for (int line_num = 0; std::getline(iss, line); line_num++) { + std::string name; + std::string value; + ASSERT_OK( + RocksDBOptionsParser::ParseStatement(&name, &value, line, line_num)); + (*map)[name] = value; + } +} + +TEST_F(CustomizableTest, ConfigureFromPropsTest) { + std::unordered_map opt_map = { + {"unique.id", "A"}, {"unique.A.int", "1"}, {"unique.A.bool", "true"}, + {"shared.id", "B"}, {"shared.B.string", "s"}, + }; + 
std::unique_ptr configurable(new SimpleConfigurable()); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map)); + SimpleOptions* simple = configurable->GetOptions("simple"); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "A"); + std::string opt_str; + std::string mismatch; + config_options_.delimiter = "\n"; + std::unordered_map props; + ASSERT_OK(configurable->GetOptionString(config_options_, &opt_str)); + GetMapFromProperties(opt_str, &props); + std::unique_ptr copy(new SimpleConfigurable()); + ASSERT_OK(copy->ConfigureFromMap(config_options_, props)); + ASSERT_TRUE( + configurable->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +TEST_F(CustomizableTest, ConfigureFromShortTest) { + std::unordered_map opt_map = { + {"unique.id", "A"}, {"unique.A.int", "1"}, {"unique.A.bool", "true"}, + {"shared.id", "B"}, {"shared.B.string", "s"}, + }; + std::unique_ptr configurable(new SimpleConfigurable()); + ASSERT_OK(configurable->ConfigureFromMap(config_options_, opt_map)); + SimpleOptions* simple = configurable->GetOptions("simple"); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), "A"); +} + +TEST_F(CustomizableTest, AreEquivalentOptionsTest) { + std::unordered_map opt_map = { + {"unique", "id=A;int=1;bool=true"}, + {"shared", "id=A;int=1;bool=true"}, + }; + std::string mismatch; + ConfigOptions config_options = config_options_; + config_options.invoke_prepare_options = false; + std::unique_ptr c1(new SimpleConfigurable()); + std::unique_ptr c2(new SimpleConfigurable()); + ASSERT_OK(c1->ConfigureFromMap(config_options, opt_map)); + ASSERT_OK(c2->ConfigureFromMap(config_options, opt_map)); + ASSERT_TRUE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); + SimpleOptions* simple = c1->GetOptions("simple"); + ASSERT_TRUE( + simple->cu->AreEquivalent(config_options, simple->cs.get(), &mismatch)); + ASSERT_OK(simple->cu->ConfigureOption(config_options, "int", "2")); + ASSERT_FALSE( + simple->cu->AreEquivalent(config_options, simple->cs.get(), &mismatch)); + ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); + ConfigOptions loosely = config_options; + loosely.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible; + ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch)); + ASSERT_TRUE(simple->cu->AreEquivalent(loosely, simple->cs.get(), &mismatch)); + + ASSERT_OK(c1->ConfigureOption(config_options, "shared", "id=B;string=3")); + ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch)); + ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); + ASSERT_FALSE(simple->cs->AreEquivalent(loosely, simple->cu.get(), &mismatch)); + simple->cs.reset(); + ASSERT_TRUE(c1->AreEquivalent(loosely, c2.get(), &mismatch)); + ASSERT_FALSE(c1->AreEquivalent(config_options, c2.get(), &mismatch)); +} + +// Tests that we can initialize a customizable from its options +TEST_F(CustomizableTest, ConfigureStandaloneCustomTest) { + std::unique_ptr base, copy; + auto registry = ObjectRegistry::NewInstance(); + ASSERT_OK(registry->NewUniqueObject("A", &base)); + ASSERT_OK(registry->NewUniqueObject("A", ©)); + ASSERT_OK(base->ConfigureFromString(config_options_, "int=33;bool=true")); + std::string opt_str; + std::string mismatch; + ASSERT_OK(base->GetOptionString(config_options_, &opt_str)); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); +} + +// Tests that we 
fail appropriately if the pattern is not registered +TEST_F(CustomizableTest, BadNameTest) { + config_options_.ignore_unsupported_options = false; + std::unique_ptr c1(new SimpleConfigurable()); + ASSERT_NOK( + c1->ConfigureFromString(config_options_, "unique.shared.id=bad name")); + config_options_.ignore_unsupported_options = true; + ASSERT_OK( + c1->ConfigureFromString(config_options_, "unique.shared.id=bad name")); +} + +// Tests that we fail appropriately if a bad option is passed to the underlying +// configurable +TEST_F(CustomizableTest, BadOptionTest) { + std::unique_ptr c1(new SimpleConfigurable()); + ConfigOptions ignore = config_options_; + ignore.ignore_unknown_options = true; + + ASSERT_NOK(c1->ConfigureFromString(config_options_, "A.int=11")); + ASSERT_NOK(c1->ConfigureFromString(config_options_, "shared={id=B;int=1}")); + ASSERT_OK(c1->ConfigureFromString(ignore, "shared={id=A;string=s}")); + ASSERT_NOK(c1->ConfigureFromString(config_options_, "B.int=11")); + ASSERT_OK(c1->ConfigureFromString(ignore, "B.int=11")); + ASSERT_NOK(c1->ConfigureFromString(config_options_, "A.string=s")); + ASSERT_OK(c1->ConfigureFromString(ignore, "A.string=s")); + // Test as detached + ASSERT_NOK( + c1->ConfigureFromString(config_options_, "shared.id=A;A.string=b}")); + ASSERT_OK(c1->ConfigureFromString(ignore, "shared.id=A;A.string=s}")); +} + +// Tests that different IDs lead to different objects +TEST_F(CustomizableTest, UniqueIdTest) { + std::unique_ptr base(new SimpleConfigurable()); + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_1;int=1;bool=true}")); + SimpleOptions* simple = base->GetOptions("simple"); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(simple->cu->GetId(), std::string("A_1")); + std::string opt_str; + std::string mismatch; + ASSERT_OK(base->GetOptionString(config_options_, &opt_str)); + std::unique_ptr copy(new SimpleConfigurable()); + ASSERT_OK(copy->ConfigureFromString(config_options_, opt_str)); + ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_2;int=1;bool=true}")); + ASSERT_FALSE(base->AreEquivalent(config_options_, copy.get(), &mismatch)); + ASSERT_EQ(simple->cu->GetId(), std::string("A_2")); +} + +TEST_F(CustomizableTest, IsInstanceOfTest) { + std::shared_ptr tc = std::make_shared("A"); + + ASSERT_TRUE(tc->IsInstanceOf("A")); + ASSERT_TRUE(tc->IsInstanceOf("TestCustomizable")); + ASSERT_FALSE(tc->IsInstanceOf("B")); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), nullptr); + + tc.reset(new BCustomizable("B")); + ASSERT_TRUE(tc->IsInstanceOf("B")); + ASSERT_TRUE(tc->IsInstanceOf("TestCustomizable")); + ASSERT_FALSE(tc->IsInstanceOf("A")); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), tc.get()); + ASSERT_EQ(tc->CheckedCast(), nullptr); +} + +static std::unordered_map inner_option_info = { +#ifndef ROCKSDB_LITE + {"inner", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kStringNameOnly)} +#endif // ROCKSDB_LITE +}; + +class InnerCustomizable : public Customizable { + public: + explicit InnerCustomizable(const std::shared_ptr& w) + : inner_(w) {} + static const char* kClassName() { return "Inner"; } + bool IsInstanceOf(const std::string& name) const override { + if (name == kClassName()) { + return true; + } else { + return Customizable::IsInstanceOf(name); + } + } + + protected: + const 
Customizable* Inner() const override { return inner_.get(); } + + private: + std::shared_ptr inner_; +}; + +class WrappedCustomizable1 : public InnerCustomizable { + public: + explicit WrappedCustomizable1(const std::shared_ptr& w) + : InnerCustomizable(w) {} + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "Wrapped1"; } +}; + +class WrappedCustomizable2 : public InnerCustomizable { + public: + explicit WrappedCustomizable2(const std::shared_ptr& w) + : InnerCustomizable(w) {} + const char* Name() const override { return kClassName(); } + static const char* kClassName() { return "Wrapped2"; } +}; + +TEST_F(CustomizableTest, WrappedInnerTest) { + std::shared_ptr ac = + std::make_shared("A"); + + ASSERT_TRUE(ac->IsInstanceOf("A")); + ASSERT_TRUE(ac->IsInstanceOf("TestCustomizable")); + ASSERT_EQ(ac->CheckedCast(), ac.get()); + ASSERT_EQ(ac->CheckedCast(), nullptr); + ASSERT_EQ(ac->CheckedCast(), nullptr); + ASSERT_EQ(ac->CheckedCast(), nullptr); + std::shared_ptr wc1 = + std::make_shared(ac); + + ASSERT_TRUE(wc1->IsInstanceOf(WrappedCustomizable1::kClassName())); + ASSERT_EQ(wc1->CheckedCast(), wc1.get()); + ASSERT_EQ(wc1->CheckedCast(), nullptr); + ASSERT_EQ(wc1->CheckedCast(), wc1.get()); + ASSERT_EQ(wc1->CheckedCast(), ac.get()); + + std::shared_ptr wc2 = + std::make_shared(wc1); + ASSERT_TRUE(wc2->IsInstanceOf(WrappedCustomizable2::kClassName())); + ASSERT_EQ(wc2->CheckedCast(), wc2.get()); + ASSERT_EQ(wc2->CheckedCast(), wc1.get()); + ASSERT_EQ(wc2->CheckedCast(), wc2.get()); + ASSERT_EQ(wc2->CheckedCast(), ac.get()); +} + +class ShallowCustomizable : public Customizable { + public: + ShallowCustomizable() { + inner_ = std::make_shared("a"); + RegisterOptions("inner", &inner_, &inner_option_info); + }; + static const char* kClassName() { return "shallow"; } + const char* Name() const override { return kClassName(); } + + private: + std::shared_ptr inner_; +}; + +TEST_F(CustomizableTest, TestStringDepth) { + ConfigOptions shallow = config_options_; + std::unique_ptr c(new ShallowCustomizable()); + std::string opt_str; + shallow.depth = ConfigOptions::Depth::kDepthShallow; + ASSERT_OK(c->GetOptionString(shallow, &opt_str)); + ASSERT_EQ(opt_str, "inner=a;"); + shallow.depth = ConfigOptions::Depth::kDepthDetailed; + ASSERT_OK(c->GetOptionString(shallow, &opt_str)); + ASSERT_NE(opt_str, "inner=a;"); +} + +// Tests that we only get a new customizable when it changes +TEST_F(CustomizableTest, NewCustomizableTest) { + std::unique_ptr base(new SimpleConfigurable()); + A_count = 0; + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_1;int=1;bool=true}")); + SimpleOptions* simple = base->GetOptions("simple"); + ASSERT_NE(simple, nullptr); + ASSERT_NE(simple->cu, nullptr); + ASSERT_EQ(A_count, 1); // Created one A + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_1;int=1;bool=false}")); + ASSERT_EQ(A_count, 2); // Create another A_1 + ASSERT_OK(base->ConfigureFromString(config_options_, + "unique={id=A_2;int=1;bool=false}")); + ASSERT_EQ(A_count, 3); // Created another A + ASSERT_OK(base->ConfigureFromString(config_options_, "unique.id=")); + ASSERT_EQ(simple->cu, nullptr); + ASSERT_EQ(A_count, 3); +} + +TEST_F(CustomizableTest, IgnoreUnknownObjects) { + ConfigOptions ignore = config_options_; + std::shared_ptr shared; + std::unique_ptr unique; + TestCustomizable* pointer = nullptr; + ignore.ignore_unsupported_options = false; + ASSERT_NOK( + LoadSharedObject(ignore, "Unknown", nullptr, &shared)); + 
ASSERT_NOK( + LoadUniqueObject(ignore, "Unknown", nullptr, &unique)); + ASSERT_NOK( + LoadStaticObject(ignore, "Unknown", nullptr, &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ignore.ignore_unsupported_options = true; + ASSERT_OK( + LoadSharedObject(ignore, "Unknown", nullptr, &shared)); + ASSERT_OK( + LoadUniqueObject(ignore, "Unknown", nullptr, &unique)); + ASSERT_OK( + LoadStaticObject(ignore, "Unknown", nullptr, &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ASSERT_OK(LoadSharedObject(ignore, "id=Unknown", nullptr, + &shared)); + ASSERT_OK(LoadUniqueObject(ignore, "id=Unknown", nullptr, + &unique)); + ASSERT_OK(LoadStaticObject(ignore, "id=Unknown", nullptr, + &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ASSERT_OK(LoadSharedObject(ignore, "id=Unknown;option=bad", + nullptr, &shared)); + ASSERT_OK(LoadUniqueObject(ignore, "id=Unknown;option=bad", + nullptr, &unique)); + ASSERT_OK(LoadStaticObject(ignore, "id=Unknown;option=bad", + nullptr, &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); +} + +TEST_F(CustomizableTest, FactoryFunctionTest) { + std::shared_ptr shared; + std::unique_ptr unique; + TestCustomizable* pointer = nullptr; + ConfigOptions ignore = config_options_; + ignore.ignore_unsupported_options = false; + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &shared)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &unique)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &pointer)); + ASSERT_NE(shared.get(), nullptr); + ASSERT_NE(unique.get(), nullptr); + ASSERT_NE(pointer, nullptr); + delete pointer; + pointer = nullptr; + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &shared)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &unique)); + ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &pointer)); + ASSERT_EQ(shared.get(), nullptr); + ASSERT_EQ(unique.get(), nullptr); + ASSERT_EQ(pointer, nullptr); + ASSERT_NOK(TestCustomizable::CreateFromString(ignore, "option=bad", &shared)); + ASSERT_NOK(TestCustomizable::CreateFromString(ignore, "option=bad", &unique)); + ASSERT_NOK( + TestCustomizable::CreateFromString(ignore, "option=bad", &pointer)); + ASSERT_EQ(pointer, nullptr); +} + +TEST_F(CustomizableTest, MutableOptionsTest) { + static std::unordered_map mutable_option_info = { + {"mutable", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kMutable)}}; + static std::unordered_map immutable_option_info = + {{"immutable", + OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone)}}; + + class MutableCustomizable : public Customizable { + private: + std::shared_ptr mutable_; + std::shared_ptr immutable_; + + public: + MutableCustomizable() { + RegisterOptions("mutable", &mutable_, &mutable_option_info); + RegisterOptions("immutable", &immutable_, &immutable_option_info); + } + const char* Name() const override { return "MutableCustomizable"; } + }; + MutableCustomizable mc; + + ConfigOptions options = config_options_; + ASSERT_FALSE(mc.IsPrepared()); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{id=B;}")); + ASSERT_OK(mc.ConfigureOption(options, "immutable", "{id=A; int=10}")); + auto* mm = mc.GetOptions>("mutable"); + auto* im = 
mc.GetOptions>("immutable"); + ASSERT_NE(mm, nullptr); + ASSERT_NE(mm->get(), nullptr); + ASSERT_NE(im, nullptr); + ASSERT_NE(im->get(), nullptr); + + // Now only deal with mutable options + options.mutable_options_only = true; + + // Setting nested immutable customizable options fails + ASSERT_NOK(mc.ConfigureOption(options, "immutable", "{id=B;}")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable.id", "B")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable.bool", "true")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable", "bool=true")); + ASSERT_NOK(mc.ConfigureOption(options, "immutable", "{int=11;bool=true}")); + auto* im_a = im->get()->GetOptions("A"); + ASSERT_NE(im_a, nullptr); + ASSERT_EQ(im_a->i, 10); + ASSERT_EQ(im_a->b, false); + + // Setting nested mutable customizable options succeeds but the object did not + // change + ASSERT_OK(mc.ConfigureOption(options, "immutable.int", "11")); + ASSERT_EQ(im_a->i, 11); + ASSERT_EQ(im_a, im->get()->GetOptions("A")); + + // The mutable configurable itself can be changed + ASSERT_OK(mc.ConfigureOption(options, "mutable.id", "A")); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "A")); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{id=A}")); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{bool=true}")); + + // The Nested options in the mutable object can be changed + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{bool=true}")); + auto* mm_a = mm->get()->GetOptions("A"); + ASSERT_EQ(mm_a->b, true); + ASSERT_OK(mc.ConfigureOption(options, "mutable", "{int=11;bool=false}")); + mm_a = mm->get()->GetOptions("A"); + ASSERT_EQ(mm_a->i, 11); + ASSERT_EQ(mm_a->b, false); +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE +// This method loads existing test classes into the ObjectRegistry +static int RegisterTestObjects(ObjectLibrary& library, + const std::string& /*arg*/) { + size_t num_types; + library.Register( + "MockTable", + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new mock::MockTableFactory()); + return guard->get(); + }); + return static_cast(library.GetFactoryCount(&num_types)); +} + +static int RegisterLocalObjects(ObjectLibrary& library, + const std::string& /*arg*/) { + size_t num_types; + // Load any locally defined objects here + return static_cast(library.GetFactoryCount(&num_types)); +} + +class LoadCustomizableTest : public testing::Test { + public: + LoadCustomizableTest() { config_options_.ignore_unsupported_options = false; } + bool RegisterTests(const std::string& arg) { +#ifndef ROCKSDB_LITE + config_options_.registry->AddLibrary("custom-tests", RegisterTestObjects, + arg); + config_options_.registry->AddLibrary("local-tests", RegisterLocalObjects, + arg); + return true; +#else + (void)arg; + return false; +#endif // !ROCKSDB_LITE + } + + protected: + DBOptions db_opts_; + ColumnFamilyOptions cf_opts_; + ConfigOptions config_options_; +}; + +TEST_F(LoadCustomizableTest, LoadTableFactoryTest) { + std::shared_ptr factory; + ASSERT_NOK( + TableFactory::CreateFromString(config_options_, "MockTable", &factory)); + ASSERT_OK(TableFactory::CreateFromString( + config_options_, TableFactory::kBlockBasedTableName(), &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), TableFactory::kBlockBasedTableName()); + + if (RegisterTests("Test")) { + ASSERT_OK( + TableFactory::CreateFromString(config_options_, "MockTable", &factory)); + ASSERT_NE(factory, nullptr); + ASSERT_STREQ(factory->Name(), "MockTable"); + } +} +#endif // 
!ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); +#ifdef GFLAGS + ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} diff --git a/options/db_options.cc b/options/db_options.cc index 4b7dba96d0a..00ff6aacb49 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -17,6 +17,7 @@ #include "rocksdb/file_system.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/sst_file_manager.h" +#include "rocksdb/system_clock.h" #include "rocksdb/utilities/options_type.h" #include "rocksdb/wal_filter.h" #include "util/string_util.h" @@ -136,6 +137,7 @@ static std::unordered_map std::shared_ptr statistics; std::vector db_paths; std::vector> listeners; + FileTypeSet checksum_handoff_file_types; */ {"advise_random_on_open", {offsetof(struct ImmutableDBOptions, advise_random_on_open), @@ -198,6 +200,15 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, paranoid_checks), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"flush_verify_memtable_count", + {offsetof(struct ImmutableDBOptions, flush_verify_memtable_count), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"track_and_verify_wals_in_manifest", + {offsetof(struct ImmutableDBOptions, + track_and_verify_wals_in_manifest), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"skip_log_error_on_recovery", {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, OptionTypeFlags::kNone}}, @@ -265,11 +276,11 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, wal_dir), OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"WAL_size_limit_MB", - {offsetof(struct ImmutableDBOptions, wal_size_limit_mb), + {offsetof(struct ImmutableDBOptions, WAL_size_limit_MB), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"WAL_ttl_seconds", - {offsetof(struct ImmutableDBOptions, wal_ttl_seconds), + {offsetof(struct ImmutableDBOptions, WAL_ttl_seconds), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"max_manifest_file_size", @@ -388,6 +399,9 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, bgerror_resume_retry_interval), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"db_host_id", + {offsetof(struct ImmutableDBOptions, db_host_id), OptionType::kString, + OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever}}, // The following properties were handled as special cases in ParseOption // This means that the properties could be read from the options file // but never written to the file or compared to each other. 
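Editor's note: the hunks above extend the name-to-OptionTypeInfo map that drives string-based option parsing in options/db_options.cc. Each entry pairs an option name with an offsetof() into the options struct plus a type tag, and the parse/serialize helpers then read or write the field through that offset. Below is a minimal, self-contained sketch of that pattern; the names (MiniOptions, MiniTypeInfo, ParseOne) are illustrative only and are not RocksDB APIs.

```cpp
// Simplified sketch of the offsetof-based registration pattern the hunks
// above extend. Illustrative names only; not part of the patch.
#include <cstddef>
#include <iostream>
#include <string>
#include <unordered_map>

struct MiniOptions {
  bool paranoid_checks = false;
  std::string db_host_id = "unset";
};

enum class MiniType { kBoolean, kString };

struct MiniTypeInfo {
  size_t offset;  // offsetof(MiniOptions, field)
  MiniType type;  // how to interpret the bytes at that offset
};

static const std::unordered_map<std::string, MiniTypeInfo> kMiniInfo = {
    {"paranoid_checks",
     {offsetof(MiniOptions, paranoid_checks), MiniType::kBoolean}},
    {"db_host_id", {offsetof(MiniOptions, db_host_id), MiniType::kString}},
};

// Mirrors the void* + static_cast addressing style this patch moves to.
bool ParseOne(MiniOptions* opts, const std::string& name,
              const std::string& value) {
  auto it = kMiniInfo.find(name);
  if (it == kMiniInfo.end()) {
    return false;
  }
  void* addr =
      static_cast<char*>(static_cast<void*>(opts)) + it->second.offset;
  switch (it->second.type) {
    case MiniType::kBoolean:
      *static_cast<bool*>(addr) = (value == "true" || value == "1");
      return true;
    case MiniType::kString:
      *static_cast<std::string*>(addr) = value;
      return true;
  }
  return false;
}

int main() {
  MiniOptions opts;
  ParseOne(&opts, "paranoid_checks", "true");
  ParseOne(&opts, "db_host_id", "host42");
  std::cout << opts.paranoid_checks << " " << opts.db_host_id << "\n";
  return 0;
}
```

Note that offsetof() on a struct containing std::string members is only conditionally supported by the C++ standard; the real code relies on it behaving as expected on the project's supported toolchains.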
@@ -397,9 +411,8 @@ static std::unordered_map (OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever), // Parse the input value as a RateLimiter [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto limiter = - reinterpret_cast*>(addr); + const std::string& value, void* addr) { + auto limiter = static_cast*>(addr); limiter->reset(NewGenericRateLimiter( static_cast(ParseUint64(value)))); return Status::OK(); @@ -409,11 +422,12 @@ static std::unordered_map OptionVerificationType::kNormal, (OptionTypeFlags::kDontSerialize | OptionTypeFlags::kCompareNever), // Parse the input value as an Env - [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { - auto old_env = reinterpret_cast(addr); // Get the old value + [](const ConfigOptions& opts, const std::string& /*name*/, + const std::string& value, void* addr) { + auto old_env = static_cast(addr); // Get the old value Env* new_env = *old_env; // Set new to old - Status s = Env::LoadEnv(value, &new_env); // Update new value + Status s = Env::CreateFromString(opts, value, + &new_env); // Update new value if (s.ok()) { // It worked *old_env = new_env; // Update the old one } @@ -433,10 +447,9 @@ const std::string OptionsHelper::kDBOptionsName = "DBOptions"; class MutableDBConfigurable : public Configurable { public: - MutableDBConfigurable(const MutableDBOptions& mdb) { + explicit MutableDBConfigurable(const MutableDBOptions& mdb) { mutable_ = mdb; - ConfigurableHelper::RegisterOptions(*this, &mutable_, - &db_mutable_options_type_info); + RegisterOptions(&mutable_, &db_mutable_options_type_info); } protected: @@ -445,7 +458,7 @@ class MutableDBConfigurable : public Configurable { class DBOptionsConfigurable : public MutableDBConfigurable { public: - DBOptionsConfigurable(const DBOptions& opts) + explicit DBOptionsConfigurable(const DBOptions& opts) : MutableDBConfigurable(MutableDBOptions(opts)), db_options_(opts) { // The ImmutableDBOptions currently requires the env to be non-null. 
Make // sure it is @@ -456,8 +469,7 @@ class DBOptionsConfigurable : public MutableDBConfigurable { copy.env = Env::Default(); immutable_ = ImmutableDBOptions(copy); } - ConfigurableHelper::RegisterOptions(*this, &immutable_, - &db_immutable_options_type_info); + RegisterOptions(&immutable_, &db_immutable_options_type_info); } protected: @@ -465,8 +477,7 @@ class DBOptionsConfigurable : public MutableDBConfigurable { const ConfigOptions& config_options, const std::unordered_map& opts_map, std::unordered_map* unused) override { - Status s = ConfigurableHelper::ConfigureOptions(config_options, *this, - opts_map, unused); + Status s = Configurable::ConfigureOptions(config_options, opts_map, unused); if (s.ok()) { db_options_ = BuildDBOptions(immutable_, mutable_); s = PrepareOptions(config_options); @@ -505,8 +516,10 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) create_missing_column_families(options.create_missing_column_families), error_if_exists(options.error_if_exists), paranoid_checks(options.paranoid_checks), + flush_verify_memtable_count(options.flush_verify_memtable_count), + track_and_verify_wals_in_manifest( + options.track_and_verify_wals_in_manifest), env(options.env), - fs(options.env->GetFileSystem()), rate_limiter(options.rate_limiter), sst_file_manager(options.sst_file_manager), info_log(options.info_log), @@ -523,8 +536,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) recycle_log_file_num(options.recycle_log_file_num), max_manifest_file_size(options.max_manifest_file_size), table_cache_numshardbits(options.table_cache_numshardbits), - wal_ttl_seconds(options.WAL_ttl_seconds), - wal_size_limit_mb(options.WAL_size_limit_MB), + WAL_ttl_seconds(options.WAL_ttl_seconds), + WAL_size_limit_MB(options.WAL_size_limit_MB), max_write_batch_group_size_bytes( options.max_write_batch_group_size_bytes), manifest_preallocation_size(options.manifest_preallocation_size), @@ -579,7 +592,19 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) max_bgerror_resume_count(options.max_bgerror_resume_count), bgerror_resume_retry_interval(options.bgerror_resume_retry_interval), allow_data_in_errors(options.allow_data_in_errors), - disable_manifest_sync(options.disable_manifest_sync) { + disable_manifest_sync(options.disable_manifest_sync), + db_host_id(options.db_host_id), + checksum_handoff_file_types(options.checksum_handoff_file_types), + compaction_service(options.compaction_service) { + stats = statistics.get(); + fs = env->GetFileSystem(); + if (env != nullptr) { + clock = env->GetSystemClock().get(); + } else { + clock = SystemClock::Default().get(); + } + logger = info_log.get(); + stats = statistics.get(); } void ImmutableDBOptions::Dump(Logger* log) const { @@ -589,6 +614,12 @@ void ImmutableDBOptions::Dump(Logger* log) const { create_if_missing); ROCKS_LOG_HEADER(log, " Options.paranoid_checks: %d", paranoid_checks); + ROCKS_LOG_HEADER(log, " Options.flush_verify_memtable_count: %d", + flush_verify_memtable_count); + ROCKS_LOG_HEADER(log, + " " + "Options.track_and_verify_wals_in_manifest: %d", + track_and_verify_wals_in_manifest); ROCKS_LOG_HEADER(log, " Options.env: %p", env); ROCKS_LOG_HEADER(log, " Options.fs: %s", @@ -598,7 +629,7 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.max_file_opening_threads: %d", max_file_opening_threads); ROCKS_LOG_HEADER(log, " Options.statistics: %p", - statistics.get()); + stats); ROCKS_LOG_HEADER(log, " Options.use_fsync: %d", use_fsync); ROCKS_LOG_HEADER( @@ 
-638,10 +669,10 @@ void ImmutableDBOptions::Dump(Logger* log) const { table_cache_numshardbits); ROCKS_LOG_HEADER(log, " Options.WAL_ttl_seconds: %" PRIu64, - wal_ttl_seconds); + WAL_ttl_seconds); ROCKS_LOG_HEADER(log, " Options.WAL_size_limit_MB: %" PRIu64, - wal_size_limit_mb); + WAL_size_limit_MB); ROCKS_LOG_HEADER(log, " " "Options.max_write_batch_group_size_bytes: %" PRIu64, @@ -739,6 +770,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { allow_data_in_errors); ROCKS_LOG_HEADER(log, " Options.disable_manifest_sync: %d", disable_manifest_sync); + ROCKS_LOG_HEADER(log, " Options.db_host_id: %s", + db_host_id.c_str()); } MutableDBOptions::MutableDBOptions() @@ -827,4 +860,27 @@ void MutableDBOptions::Dump(Logger* log) const { max_background_flushes); } +#ifndef ROCKSDB_LITE +Status GetMutableDBOptionsFromStrings( + const MutableDBOptions& base_options, + const std::unordered_map& options_map, + MutableDBOptions* new_options) { + assert(new_options); + *new_options = base_options; + ConfigOptions config_options; + Status s = OptionTypeInfo::ParseType( + config_options, options_map, db_mutable_options_type_info, new_options); + if (!s.ok()) { + *new_options = base_options; + } + return s; +} + +Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, + const MutableDBOptions& mutable_opts, + std::string* opt_string) { + return OptionTypeInfo::SerializeType( + config_options, db_mutable_options_type_info, &mutable_opts, opt_string); +} +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/options/db_options.h b/options/db_options.h index 61642e42611..cdd08e80a11 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -11,6 +11,7 @@ #include "rocksdb/options.h" namespace ROCKSDB_NAMESPACE { +class SystemClock; struct ImmutableDBOptions { static const char* kName() { return "ImmutableDBOptions"; } @@ -23,8 +24,9 @@ struct ImmutableDBOptions { bool create_missing_column_families; bool error_if_exists; bool paranoid_checks; + bool flush_verify_memtable_count; + bool track_and_verify_wals_in_manifest; Env* env; - std::shared_ptr fs; std::shared_ptr rate_limiter; std::shared_ptr sst_file_manager; std::shared_ptr info_log; @@ -41,8 +43,8 @@ struct ImmutableDBOptions { size_t recycle_log_file_num; uint64_t max_manifest_file_size; int table_cache_numshardbits; - uint64_t wal_ttl_seconds; - uint64_t wal_size_limit_mb; + uint64_t WAL_ttl_seconds; + uint64_t WAL_size_limit_MB; uint64_t max_write_batch_group_size_bytes; size_t manifest_preallocation_size; bool allow_mmap_reads; @@ -93,6 +95,14 @@ struct ImmutableDBOptions { uint64_t bgerror_resume_retry_interval; bool allow_data_in_errors; bool disable_manifest_sync; + std::string db_host_id; + FileTypeSet checksum_handoff_file_types; + // Convenience/Helper objects that are not part of the base DBOptions + std::shared_ptr fs; + SystemClock* clock; + Statistics* stats; + Logger* logger; + std::shared_ptr compaction_service; }; struct MutableDBOptions { @@ -123,4 +133,15 @@ struct MutableDBOptions { int max_background_flushes; }; +#ifndef ROCKSDB_LITE +Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, + const MutableDBOptions& mutable_opts, + std::string* opt_string); + +Status GetMutableDBOptionsFromStrings( + const MutableDBOptions& base_options, + const std::unordered_map& options_map, + MutableDBOptions* new_options); +#endif // ROCKSDB_LITE + } // namespace ROCKSDB_NAMESPACE diff --git a/options/options.cc b/options/options.cc index cf00059b71d..4faee64b4b1 100644 --- 
a/options/options.cc +++ b/options/options.cc @@ -92,7 +92,10 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) enable_blob_files(options.enable_blob_files), min_blob_size(options.min_blob_size), blob_file_size(options.blob_file_size), - blob_compression_type(options.blob_compression_type) { + blob_compression_type(options.blob_compression_type), + enable_blob_garbage_collection(options.enable_blob_garbage_collection), + blob_garbage_collection_age_cutoff( + options.blob_garbage_collection_age_cutoff) { assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { @@ -198,6 +201,11 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.bottommost_compression_opts.enabled: %s", bottommost_compression_opts.enabled ? "true" : "false"); + ROCKS_LOG_HEADER( + log, + " Options.bottommost_compression_opts.max_dict_buffer_bytes: " + "%" PRIu64, + bottommost_compression_opts.max_dict_buffer_bytes); ROCKS_LOG_HEADER(log, " Options.compression_opts.window_bits: %d", compression_opts.window_bits); ROCKS_LOG_HEADER(log, " Options.compression_opts.level: %d", @@ -219,6 +227,10 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.compression_opts.enabled: %s", compression_opts.enabled ? "true" : "false"); + ROCKS_LOG_HEADER(log, + " Options.compression_opts.max_dict_buffer_bytes: " + "%" PRIu64, + compression_opts.max_dict_buffer_bytes); ROCKS_LOG_HEADER(log, " Options.level0_file_num_compaction_trigger: %d", level0_file_num_compaction_trigger); ROCKS_LOG_HEADER(log, " Options.level0_slowdown_writes_trigger: %d", @@ -383,6 +395,10 @@ void ColumnFamilyOptions::Dump(Logger* log) const { blob_file_size); ROCKS_LOG_HEADER(log, " Options.blob_compression_type: %s", CompressionTypeToString(blob_compression_type).c_str()); + ROCKS_LOG_HEADER(log, " Options.enable_blob_garbage_collection: %s", + enable_blob_garbage_collection ? 
"true" : "false"); + ROCKS_LOG_HEADER(log, " Options.blob_garbage_collection_age_cutoff: %f", + blob_garbage_collection_age_cutoff); } // ColumnFamilyOptions::Dump void Options::Dump(Logger* log) const { diff --git a/options/options_helper.cc b/options/options_helper.cc index e4a53de8bd2..0aaa73b2543 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -28,6 +28,20 @@ #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { +ConfigOptions::ConfigOptions() +#ifndef ROCKSDB_LITE + : registry(ObjectRegistry::NewInstance()) +#endif +{ + env = Env::Default(); +} + +ConfigOptions::ConfigOptions(const DBOptions& db_opts) : env(db_opts.env) { +#ifndef ROCKSDB_LITE + registry = ObjectRegistry::NewInstance(); +#endif +} + Status ValidateOptions(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) { Status s; @@ -51,6 +65,10 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, immutable_db_options.create_missing_column_families; options.error_if_exists = immutable_db_options.error_if_exists; options.paranoid_checks = immutable_db_options.paranoid_checks; + options.flush_verify_memtable_count = + immutable_db_options.flush_verify_memtable_count; + options.track_and_verify_wals_in_manifest = + immutable_db_options.track_and_verify_wals_in_manifest; options.env = immutable_db_options.env; options.rate_limiter = immutable_db_options.rate_limiter; options.sst_file_manager = immutable_db_options.sst_file_manager; @@ -84,8 +102,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.max_manifest_file_size = immutable_db_options.max_manifest_file_size; options.table_cache_numshardbits = immutable_db_options.table_cache_numshardbits; - options.WAL_ttl_seconds = immutable_db_options.wal_ttl_seconds; - options.WAL_size_limit_MB = immutable_db_options.wal_size_limit_mb; + options.WAL_ttl_seconds = immutable_db_options.WAL_ttl_seconds; + options.WAL_size_limit_MB = immutable_db_options.WAL_size_limit_MB; options.manifest_preallocation_size = immutable_db_options.manifest_preallocation_size; options.allow_mmap_reads = immutable_db_options.allow_mmap_reads; @@ -166,6 +184,10 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.bgerror_resume_retry_interval = immutable_db_options.bgerror_resume_retry_interval; options.disable_manifest_sync = immutable_db_options.disable_manifest_sync; + options.db_host_id = immutable_db_options.db_host_id; + options.allow_data_in_errors = immutable_db_options.allow_data_in_errors; + options.checksum_handoff_file_types = + immutable_db_options.checksum_handoff_file_types; return options; } @@ -173,81 +195,115 @@ ColumnFamilyOptions BuildColumnFamilyOptions( const ColumnFamilyOptions& options, const MutableCFOptions& mutable_cf_options) { ColumnFamilyOptions cf_opts(options); + UpdateColumnFamilyOptions(mutable_cf_options, &cf_opts); + // TODO(yhchiang): find some way to handle the following derived options + // * max_file_size + return cf_opts; +} +void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, + ColumnFamilyOptions* cf_opts) { // Memtable related options - cf_opts.write_buffer_size = mutable_cf_options.write_buffer_size; - cf_opts.max_write_buffer_number = mutable_cf_options.max_write_buffer_number; - cf_opts.arena_block_size = mutable_cf_options.arena_block_size; - cf_opts.memtable_prefix_bloom_size_ratio = - mutable_cf_options.memtable_prefix_bloom_size_ratio; - cf_opts.memtable_whole_key_filtering = - 
mutable_cf_options.memtable_whole_key_filtering; - cf_opts.memtable_huge_page_size = mutable_cf_options.memtable_huge_page_size; - cf_opts.max_successive_merges = mutable_cf_options.max_successive_merges; - cf_opts.inplace_update_num_locks = - mutable_cf_options.inplace_update_num_locks; - cf_opts.prefix_extractor = mutable_cf_options.prefix_extractor; + cf_opts->write_buffer_size = moptions.write_buffer_size; + cf_opts->max_write_buffer_number = moptions.max_write_buffer_number; + cf_opts->arena_block_size = moptions.arena_block_size; + cf_opts->memtable_prefix_bloom_size_ratio = + moptions.memtable_prefix_bloom_size_ratio; + cf_opts->memtable_whole_key_filtering = moptions.memtable_whole_key_filtering; + cf_opts->memtable_huge_page_size = moptions.memtable_huge_page_size; + cf_opts->max_successive_merges = moptions.max_successive_merges; + cf_opts->inplace_update_num_locks = moptions.inplace_update_num_locks; + cf_opts->prefix_extractor = moptions.prefix_extractor; // Compaction related options - cf_opts.disable_auto_compactions = - mutable_cf_options.disable_auto_compactions; - cf_opts.soft_pending_compaction_bytes_limit = - mutable_cf_options.soft_pending_compaction_bytes_limit; - cf_opts.hard_pending_compaction_bytes_limit = - mutable_cf_options.hard_pending_compaction_bytes_limit; - cf_opts.level0_file_num_compaction_trigger = - mutable_cf_options.level0_file_num_compaction_trigger; - cf_opts.level0_slowdown_writes_trigger = - mutable_cf_options.level0_slowdown_writes_trigger; - cf_opts.level0_stop_writes_trigger = - mutable_cf_options.level0_stop_writes_trigger; - cf_opts.max_compaction_bytes = mutable_cf_options.max_compaction_bytes; - cf_opts.target_file_size_base = mutable_cf_options.target_file_size_base; - cf_opts.target_file_size_multiplier = - mutable_cf_options.target_file_size_multiplier; - cf_opts.max_bytes_for_level_base = - mutable_cf_options.max_bytes_for_level_base; - cf_opts.max_bytes_for_level_multiplier = - mutable_cf_options.max_bytes_for_level_multiplier; - cf_opts.ttl = mutable_cf_options.ttl; - cf_opts.periodic_compaction_seconds = - mutable_cf_options.periodic_compaction_seconds; - - cf_opts.max_bytes_for_level_multiplier_additional.clear(); - for (auto value : - mutable_cf_options.max_bytes_for_level_multiplier_additional) { - cf_opts.max_bytes_for_level_multiplier_additional.emplace_back(value); + cf_opts->disable_auto_compactions = moptions.disable_auto_compactions; + cf_opts->soft_pending_compaction_bytes_limit = + moptions.soft_pending_compaction_bytes_limit; + cf_opts->hard_pending_compaction_bytes_limit = + moptions.hard_pending_compaction_bytes_limit; + cf_opts->level0_file_num_compaction_trigger = + moptions.level0_file_num_compaction_trigger; + cf_opts->level0_slowdown_writes_trigger = + moptions.level0_slowdown_writes_trigger; + cf_opts->level0_stop_writes_trigger = moptions.level0_stop_writes_trigger; + cf_opts->max_compaction_bytes = moptions.max_compaction_bytes; + cf_opts->target_file_size_base = moptions.target_file_size_base; + cf_opts->target_file_size_multiplier = moptions.target_file_size_multiplier; + cf_opts->max_bytes_for_level_base = moptions.max_bytes_for_level_base; + cf_opts->max_bytes_for_level_multiplier = + moptions.max_bytes_for_level_multiplier; + cf_opts->ttl = moptions.ttl; + cf_opts->periodic_compaction_seconds = moptions.periodic_compaction_seconds; + + cf_opts->max_bytes_for_level_multiplier_additional.clear(); + for (auto value : moptions.max_bytes_for_level_multiplier_additional) { + 
cf_opts->max_bytes_for_level_multiplier_additional.emplace_back(value); } - cf_opts.compaction_options_fifo = mutable_cf_options.compaction_options_fifo; - cf_opts.compaction_options_universal = - mutable_cf_options.compaction_options_universal; + cf_opts->compaction_options_fifo = moptions.compaction_options_fifo; + cf_opts->compaction_options_universal = moptions.compaction_options_universal; // Blob file related options - cf_opts.enable_blob_files = mutable_cf_options.enable_blob_files; - cf_opts.min_blob_size = mutable_cf_options.min_blob_size; - cf_opts.blob_file_size = mutable_cf_options.blob_file_size; - cf_opts.blob_compression_type = mutable_cf_options.blob_compression_type; + cf_opts->enable_blob_files = moptions.enable_blob_files; + cf_opts->min_blob_size = moptions.min_blob_size; + cf_opts->blob_file_size = moptions.blob_file_size; + cf_opts->blob_compression_type = moptions.blob_compression_type; + cf_opts->enable_blob_garbage_collection = + moptions.enable_blob_garbage_collection; + cf_opts->blob_garbage_collection_age_cutoff = + moptions.blob_garbage_collection_age_cutoff; // Misc options - cf_opts.max_sequential_skip_in_iterations = - mutable_cf_options.max_sequential_skip_in_iterations; - cf_opts.check_flush_compaction_key_order = - mutable_cf_options.check_flush_compaction_key_order; - cf_opts.paranoid_file_checks = mutable_cf_options.paranoid_file_checks; - cf_opts.report_bg_io_stats = mutable_cf_options.report_bg_io_stats; - cf_opts.compression = mutable_cf_options.compression; - cf_opts.compression_opts = mutable_cf_options.compression_opts; - cf_opts.bottommost_compression = mutable_cf_options.bottommost_compression; - cf_opts.bottommost_compression_opts = - mutable_cf_options.bottommost_compression_opts; - cf_opts.sample_for_compression = mutable_cf_options.sample_for_compression; - - cf_opts.table_factory = options.table_factory; + cf_opts->max_sequential_skip_in_iterations = + moptions.max_sequential_skip_in_iterations; + cf_opts->check_flush_compaction_key_order = + moptions.check_flush_compaction_key_order; + cf_opts->paranoid_file_checks = moptions.paranoid_file_checks; + cf_opts->report_bg_io_stats = moptions.report_bg_io_stats; + cf_opts->compression = moptions.compression; + cf_opts->compression_opts = moptions.compression_opts; + cf_opts->bottommost_compression = moptions.bottommost_compression; + cf_opts->bottommost_compression_opts = moptions.bottommost_compression_opts; + cf_opts->sample_for_compression = moptions.sample_for_compression; +} + +void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, + ColumnFamilyOptions* cf_opts) { + cf_opts->compaction_style = ioptions.compaction_style; + cf_opts->compaction_pri = ioptions.compaction_pri; + cf_opts->comparator = ioptions.user_comparator; + cf_opts->merge_operator = ioptions.merge_operator; + cf_opts->compaction_filter = ioptions.compaction_filter; + cf_opts->compaction_filter_factory = ioptions.compaction_filter_factory; + cf_opts->min_write_buffer_number_to_merge = + ioptions.min_write_buffer_number_to_merge; + cf_opts->max_write_buffer_number_to_maintain = + ioptions.max_write_buffer_number_to_maintain; + cf_opts->max_write_buffer_size_to_maintain = + ioptions.max_write_buffer_size_to_maintain; + cf_opts->inplace_update_support = ioptions.inplace_update_support; + cf_opts->inplace_callback = ioptions.inplace_callback; + cf_opts->memtable_factory = ioptions.memtable_factory; + cf_opts->table_factory = ioptions.table_factory; + cf_opts->table_properties_collector_factories = + 
ioptions.table_properties_collector_factories; + cf_opts->bloom_locality = ioptions.bloom_locality; + cf_opts->purge_redundant_kvs_while_flush = + ioptions.purge_redundant_kvs_while_flush; + cf_opts->compression_per_level = ioptions.compression_per_level; + cf_opts->level_compaction_dynamic_level_bytes = + ioptions.level_compaction_dynamic_level_bytes; + cf_opts->num_levels = ioptions.num_levels; + cf_opts->optimize_filters_for_hits = ioptions.optimize_filters_for_hits; + cf_opts->force_consistency_checks = ioptions.force_consistency_checks; + cf_opts->memtable_insert_with_hint_prefix_extractor = + ioptions.memtable_insert_with_hint_prefix_extractor; + cf_opts->cf_paths = ioptions.cf_paths; + cf_opts->compaction_thread_limiter = ioptions.compaction_thread_limiter; + cf_opts->sst_partitioner_factory = ioptions.sst_partitioner_factory; + // TODO(yhchiang): find some way to handle the following derived options // * max_file_size - - return cf_opts; } std::map @@ -298,6 +354,17 @@ std::vector GetSupportedCompressions() { return supported_compressions; } +std::vector GetSupportedDictCompressions() { + std::vector dict_compression_types; + for (const auto& comp_to_name : OptionsHelper::compression_type_string_map) { + CompressionType t = comp_to_name.second; + if (t != kDisableCompressionOption && DictCompressionTypeSupported(t)) { + dict_compression_types.push_back(t); + } + } + return dict_compression_types; +} + #ifndef ROCKSDB_LITE bool ParseSliceTransformHelper( const std::string& kFixedPrefixName, const std::string& kCappedPrefixName, @@ -354,137 +421,144 @@ bool ParseSliceTransform( return false; } -static bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, +static bool ParseOptionHelper(void* opt_address, const OptionType& opt_type, const std::string& value) { switch (opt_type) { case OptionType::kBoolean: - *reinterpret_cast(opt_address) = ParseBoolean("", value); + *static_cast(opt_address) = ParseBoolean("", value); break; case OptionType::kInt: - *reinterpret_cast(opt_address) = ParseInt(value); + *static_cast(opt_address) = ParseInt(value); break; case OptionType::kInt32T: - *reinterpret_cast(opt_address) = ParseInt32(value); + *static_cast(opt_address) = ParseInt32(value); break; case OptionType::kInt64T: - PutUnaligned(reinterpret_cast(opt_address), ParseInt64(value)); + PutUnaligned(static_cast(opt_address), ParseInt64(value)); break; case OptionType::kUInt: - *reinterpret_cast(opt_address) = ParseUint32(value); + *static_cast(opt_address) = ParseUint32(value); + break; + case OptionType::kUInt8T: + *static_cast(opt_address) = ParseUint8(value); break; case OptionType::kUInt32T: - *reinterpret_cast(opt_address) = ParseUint32(value); + *static_cast(opt_address) = ParseUint32(value); break; case OptionType::kUInt64T: - PutUnaligned(reinterpret_cast(opt_address), ParseUint64(value)); + PutUnaligned(static_cast(opt_address), ParseUint64(value)); break; case OptionType::kSizeT: - PutUnaligned(reinterpret_cast(opt_address), ParseSizeT(value)); + PutUnaligned(static_cast(opt_address), ParseSizeT(value)); break; case OptionType::kString: - *reinterpret_cast(opt_address) = value; + *static_cast(opt_address) = value; break; case OptionType::kDouble: - *reinterpret_cast(opt_address) = ParseDouble(value); + *static_cast(opt_address) = ParseDouble(value); break; case OptionType::kCompactionStyle: return ParseEnum( compaction_style_string_map, value, - reinterpret_cast(opt_address)); + static_cast(opt_address)); case OptionType::kCompactionPri: - return ParseEnum( - 
compaction_pri_string_map, value, - reinterpret_cast(opt_address)); + return ParseEnum(compaction_pri_string_map, value, + static_cast(opt_address)); case OptionType::kCompressionType: return ParseEnum( compression_type_string_map, value, - reinterpret_cast(opt_address)); + static_cast(opt_address)); case OptionType::kSliceTransform: return ParseSliceTransform( - value, reinterpret_cast*>( - opt_address)); + value, + static_cast*>(opt_address)); case OptionType::kChecksumType: - return ParseEnum( - checksum_type_string_map, value, - reinterpret_cast(opt_address)); + return ParseEnum(checksum_type_string_map, value, + static_cast(opt_address)); case OptionType::kEncodingType: - return ParseEnum( - encoding_type_string_map, value, - reinterpret_cast(opt_address)); + return ParseEnum(encoding_type_string_map, value, + static_cast(opt_address)); case OptionType::kCompactionStopStyle: return ParseEnum( compaction_stop_style_string_map, value, - reinterpret_cast(opt_address)); + static_cast(opt_address)); + case OptionType::kEncodedString: { + std::string* output_addr = static_cast(opt_address); + (Slice(value)).DecodeHex(output_addr); + break; + } default: return false; } return true; } -bool SerializeSingleOptionHelper(const char* opt_address, +bool SerializeSingleOptionHelper(const void* opt_address, const OptionType opt_type, std::string* value) { - assert(value); switch (opt_type) { case OptionType::kBoolean: - *value = *(reinterpret_cast(opt_address)) ? "true" : "false"; + *value = *(static_cast(opt_address)) ? "true" : "false"; break; case OptionType::kInt: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kInt32T: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kInt64T: { int64_t v; - GetUnaligned(reinterpret_cast(opt_address), &v); + GetUnaligned(static_cast(opt_address), &v); *value = ToString(v); } break; case OptionType::kUInt: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); + break; + case OptionType::kUInt8T: + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kUInt32T: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kUInt64T: { uint64_t v; - GetUnaligned(reinterpret_cast(opt_address), &v); + GetUnaligned(static_cast(opt_address), &v); *value = ToString(v); } break; case OptionType::kSizeT: { size_t v; - GetUnaligned(reinterpret_cast(opt_address), &v); + GetUnaligned(static_cast(opt_address), &v); *value = ToString(v); } break; case OptionType::kDouble: - *value = ToString(*(reinterpret_cast(opt_address))); + *value = ToString(*(static_cast(opt_address))); break; case OptionType::kString: - *value = EscapeOptionString( - *(reinterpret_cast(opt_address))); + *value = + EscapeOptionString(*(static_cast(opt_address))); break; case OptionType::kCompactionStyle: return SerializeEnum( compaction_style_string_map, - *(reinterpret_cast(opt_address)), value); + *(static_cast(opt_address)), value); case OptionType::kCompactionPri: return SerializeEnum( compaction_pri_string_map, - *(reinterpret_cast(opt_address)), value); + *(static_cast(opt_address)), value); case OptionType::kCompressionType: return SerializeEnum( compression_type_string_map, - *(reinterpret_cast(opt_address)), value); + *(static_cast(opt_address)), value); case OptionType::kSliceTransform: { const auto* 
slice_transform_ptr = - reinterpret_cast*>( + static_cast*>( opt_address); *value = slice_transform_ptr->get() ? slice_transform_ptr->get()->Name() : kNullptrString; @@ -492,7 +566,7 @@ bool SerializeSingleOptionHelper(const char* opt_address, } case OptionType::kComparator: { // it's a const pointer of const Comparator* - const auto* ptr = reinterpret_cast(opt_address); + const auto* ptr = static_cast(opt_address); // Since the user-specified comparator will be wrapped by // InternalKeyComparator, we should persist the user-specified one // instead of InternalKeyComparator. @@ -510,43 +584,42 @@ bool SerializeSingleOptionHelper(const char* opt_address, case OptionType::kCompactionFilter: { // it's a const pointer of const CompactionFilter* const auto* ptr = - reinterpret_cast(opt_address); + static_cast(opt_address); *value = *ptr ? (*ptr)->Name() : kNullptrString; break; } case OptionType::kCompactionFilterFactory: { const auto* ptr = - reinterpret_cast*>( + static_cast*>( opt_address); *value = ptr->get() ? ptr->get()->Name() : kNullptrString; break; } case OptionType::kMemTableRepFactory: { const auto* ptr = - reinterpret_cast*>( - opt_address); + static_cast*>(opt_address); *value = ptr->get() ? ptr->get()->Name() : kNullptrString; break; } case OptionType::kMergeOperator: { const auto* ptr = - reinterpret_cast*>(opt_address); + static_cast*>(opt_address); *value = ptr->get() ? ptr->get()->Name() : kNullptrString; break; } case OptionType::kFilterPolicy: { const auto* ptr = - reinterpret_cast*>(opt_address); + static_cast*>(opt_address); *value = ptr->get() ? ptr->get()->Name() : kNullptrString; break; } case OptionType::kChecksumType: return SerializeEnum( checksum_type_string_map, - *reinterpret_cast(opt_address), value); + *static_cast(opt_address), value); case OptionType::kFlushBlockPolicyFactory: { const auto* ptr = - reinterpret_cast*>( + static_cast*>( opt_address); *value = ptr->get() ? 
ptr->get()->Name() : kNullptrString; break; @@ -554,11 +627,16 @@ bool SerializeSingleOptionHelper(const char* opt_address, case OptionType::kEncodingType: return SerializeEnum( encoding_type_string_map, - *reinterpret_cast(opt_address), value); + *static_cast(opt_address), value); case OptionType::kCompactionStopStyle: return SerializeEnum( compaction_stop_style_string_map, - *reinterpret_cast(opt_address), value); + *static_cast(opt_address), value); + case OptionType::kEncodedString: { + const auto* ptr = static_cast(opt_address); + *value = (Slice(*ptr)).ToString(true); + break; + } default: return false; } @@ -577,32 +655,6 @@ Status ConfigureFromMap( return s; } -Status GetMutableOptionsFromStrings( - const MutableCFOptions& base_options, - const std::unordered_map& options_map, - Logger* /*info_log*/, MutableCFOptions* new_options) { - assert(new_options); - *new_options = base_options; - ConfigOptions config_options; - const auto config = CFOptionsAsConfigurable(base_options); - return ConfigureFromMap(config_options, options_map, - MutableCFOptions::kName(), - config.get(), new_options); -} - -Status GetMutableDBOptionsFromStrings( - const MutableDBOptions& base_options, - const std::unordered_map& options_map, - MutableDBOptions* new_options) { - assert(new_options); - *new_options = base_options; - ConfigOptions config_options; - - auto config = DBOptionsAsConfigurable(base_options); - return ConfigureFromMap(config_options, options_map, - MutableDBOptions::kName(), - config.get(), new_options); -} Status StringToMap(const std::string& opts_str, std::unordered_map* opts_map) { @@ -644,17 +696,11 @@ Status StringToMap(const std::string& opts_str, return Status::OK(); } -Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, - const MutableDBOptions& mutable_opts, - std::string* opt_string) { - auto config = DBOptionsAsConfigurable(mutable_opts); - return config->GetOptionString(config_options, opt_string); -} Status GetStringFromDBOptions(std::string* opt_string, const DBOptions& db_options, const std::string& delimiter) { - ConfigOptions config_options; + ConfigOptions config_options(db_options); config_options.delimiter = delimiter; return GetStringFromDBOptions(config_options, db_options, opt_string); } @@ -668,14 +714,6 @@ Status GetStringFromDBOptions(const ConfigOptions& config_options, return config->GetOptionString(config_options, opt_string); } -Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, - const MutableCFOptions& mutable_opts, - std::string* opt_string) { - assert(opt_string); - opt_string->clear(); - const auto config = CFOptionsAsConfigurable(mutable_opts); - return config->GetOptionString(config_options, opt_string); -} Status GetStringFromColumnFamilyOptions(std::string* opt_string, const ColumnFamilyOptions& cf_options, @@ -767,7 +805,7 @@ Status GetDBOptionsFromMap( const std::unordered_map& opts_map, DBOptions* new_options, bool input_strings_escaped, bool ignore_unknown_options) { - ConfigOptions config_options; + ConfigOptions config_options(base_options); config_options.input_strings_escaped = input_strings_escaped; config_options.ignore_unknown_options = ignore_unknown_options; return GetDBOptionsFromMap(config_options, base_options, opts_map, @@ -795,7 +833,7 @@ Status GetDBOptionsFromMap( Status GetDBOptionsFromString(const DBOptions& base_options, const std::string& opts_str, DBOptions* new_options) { - ConfigOptions config_options; + ConfigOptions config_options(base_options); 
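Editor's note on the `ConfigOptions config_options(base_options);` changes in this file: the new ConfigOptions(const DBOptions&) constructor seeds the parser with the Env (and, in non-LITE builds, an ObjectRegistry) taken from the caller's base options, so values such as `env=` can be resolved through Env::CreateFromString with the right context. A hedged usage sketch of the public entry point follows; the option string and field values are arbitrary examples.

```cpp
// Hedged usage sketch (not part of the patch): GetDBOptionsFromString()
// starts from base_options and applies the parsed key=value pairs, now using
// a ConfigOptions seeded from base_options internally.
#include <cassert>
#include <string>

#include "rocksdb/convenience.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"

void Example() {
  ROCKSDB_NAMESPACE::DBOptions base;
  base.env = ROCKSDB_NAMESPACE::Env::Default();  // carried into the parser

  ROCKSDB_NAMESPACE::DBOptions parsed;
  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::GetDBOptionsFromString(
      base, "max_open_files=64;paranoid_checks=true", &parsed);
  assert(s.ok());
  assert(parsed.max_open_files == 64);
  assert(parsed.paranoid_checks);
}
```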
config_options.input_strings_escaped = false; config_options.ignore_unknown_options = false; @@ -819,7 +857,7 @@ Status GetDBOptionsFromString(const ConfigOptions& config_options, Status GetOptionsFromString(const Options& base_options, const std::string& opts_str, Options* new_options) { - ConfigOptions config_options; + ConfigOptions config_options(base_options); config_options.input_strings_escaped = false; config_options.ignore_unknown_options = false; @@ -834,6 +872,7 @@ Status GetOptionsFromString(const ConfigOptions& config_options, std::unordered_map unused_opts; std::unordered_map opts_map; + assert(new_options); *new_options = base_options; Status s = StringToMap(opts_str, &opts_map); if (!s.ok()) { @@ -946,7 +985,7 @@ Status OptionTypeInfo::Parse(const ConfigOptions& config_options, return Status::OK(); } try { - char* opt_addr = reinterpret_cast(opt_ptr) + offset_; + void* opt_addr = static_cast(opt_ptr) + offset_; const std::string& opt_value = config_options.input_strings_escaped ? UnescapeOptionString(value) : value; @@ -988,28 +1027,56 @@ Status OptionTypeInfo::Parse(const ConfigOptions& config_options, } } +Status OptionTypeInfo::ParseType( + const ConfigOptions& config_options, const std::string& opts_str, + const std::unordered_map& type_map, + void* opt_addr, std::unordered_map* unused) { + std::unordered_map opts_map; + Status status = StringToMap(opts_str, &opts_map); + if (!status.ok()) { + return status; + } else { + return ParseType(config_options, opts_map, type_map, opt_addr, unused); + } +} + +Status OptionTypeInfo::ParseType( + const ConfigOptions& config_options, + const std::unordered_map& opts_map, + const std::unordered_map& type_map, + void* opt_addr, std::unordered_map* unused) { + for (const auto& opts_iter : opts_map) { + std::string opt_name; + const auto* opt_info = Find(opts_iter.first, type_map, &opt_name); + if (opt_info != nullptr) { + Status status = + opt_info->Parse(config_options, opt_name, opts_iter.second, opt_addr); + if (!status.ok()) { + return status; + } + } else if (unused != nullptr) { + (*unused)[opts_iter.first] = opts_iter.second; + } else if (!config_options.ignore_unknown_options) { + return Status::NotFound("Unrecognized option", opts_iter.first); + } + } + return Status::OK(); +} + Status OptionTypeInfo::ParseStruct( const ConfigOptions& config_options, const std::string& struct_name, const std::unordered_map* struct_map, - const std::string& opt_name, const std::string& opt_value, char* opt_addr) { + const std::string& opt_name, const std::string& opt_value, void* opt_addr) { assert(struct_map); Status status; if (opt_name == struct_name || EndsWith(opt_name, "." + struct_name)) { // This option represents the entire struct - std::unordered_map opt_map; - status = StringToMap(opt_value, &opt_map); - for (const auto& map_iter : opt_map) { - if (!status.ok()) { - break; - } - const auto iter = struct_map->find(map_iter.first); - if (iter != struct_map->end()) { - status = iter->second.Parse(config_options, map_iter.first, - map_iter.second, opt_addr); - } else { - status = Status::InvalidArgument("Unrecognized option", - struct_name + "." + map_iter.first); - } + std::unordered_map unused; + status = + ParseType(config_options, opt_value, *struct_map, opt_addr, &unused); + if (status.ok() && !unused.empty()) { + status = Status::InvalidArgument( + "Unrecognized option", struct_name + "." 
+ unused.begin()->first); } } else if (StartsWith(opt_name, struct_name + ".")) { // This option represents a nested field in the struct (e.g, struct.field) @@ -1041,7 +1108,7 @@ Status OptionTypeInfo::Serialize(const ConfigOptions& config_options, std::string* opt_value) const { // If the option is no longer used in rocksdb and marked as deprecated, // we skip it in the serialization. - const char* opt_addr = reinterpret_cast(opt_ptr) + offset_; + const void* opt_addr = static_cast(opt_ptr) + offset_; if (opt_addr == nullptr || IsDeprecated()) { return Status::OK(); } else if (IsEnabled(OptionTypeFlags::kDontSerialize)) { @@ -1050,6 +1117,19 @@ Status OptionTypeInfo::Serialize(const ConfigOptions& config_options, return serialize_func_(config_options, opt_name, opt_addr, opt_value); } else if (SerializeSingleOptionHelper(opt_addr, type_, opt_value)) { return Status::OK(); + } else if (IsCustomizable()) { + const Customizable* custom = AsRawPointer(opt_ptr); + if (custom == nullptr) { + *opt_value = kNullptrString; + } else if (IsEnabled(OptionTypeFlags::kStringNameOnly) && + !config_options.IsDetailed()) { + *opt_value = custom->GetId(); + } else { + ConfigOptions embedded = config_options; + embedded.delimiter = ";"; + *opt_value = custom->ToString(embedded); + } + return Status::OK(); } else if (IsConfigurable()) { const Configurable* config = AsRawPointer(opt_ptr); if (config != nullptr) { @@ -1063,10 +1143,31 @@ Status OptionTypeInfo::Serialize(const ConfigOptions& config_options, } } +Status OptionTypeInfo::SerializeType( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* opt_addr, std::string* result) { + Status status; + for (const auto& iter : type_map) { + std::string single; + const auto& opt_info = iter.second; + if (opt_info.ShouldSerialize()) { + status = + opt_info.Serialize(config_options, iter.first, opt_addr, &single); + if (!status.ok()) { + return status; + } else { + result->append(iter.first + "=" + single + config_options.delimiter); + } + } + } + return status; +} + Status OptionTypeInfo::SerializeStruct( const ConfigOptions& config_options, const std::string& struct_name, const std::unordered_map* struct_map, - const std::string& opt_name, const char* opt_addr, std::string* value) { + const std::string& opt_name, const void* opt_addr, std::string* value) { assert(struct_map); Status status; if (EndsWith(opt_name, struct_name)) { @@ -1077,19 +1178,12 @@ Status OptionTypeInfo::SerializeStruct( // This option represents the entire struct std::string result; - for (const auto& iter : *struct_map) { - std::string single; - const auto& opt_info = iter.second; - if (opt_info.ShouldSerialize()) { - status = opt_info.Serialize(embedded, iter.first, opt_addr, &single); - if (!status.ok()) { - return status; - } else { - result.append(iter.first + "=" + single + embedded.delimiter); - } - } + status = SerializeType(embedded, *struct_map, opt_addr, &result); + if (!status.ok()) { + return status; + } else { + *value = "{" + result + "}"; } - *value = "{" + result + "}"; } else if (StartsWith(opt_name, struct_name + ".")) { // This option represents a nested field in the struct (e.g, struct.field) std::string elem_name; @@ -1115,17 +1209,16 @@ Status OptionTypeInfo::SerializeStruct( } template -bool IsOptionEqual(const char* offset1, const char* offset2) { - return (*reinterpret_cast(offset1) == - *reinterpret_cast(offset2)); +bool IsOptionEqual(const void* offset1, const void* offset2) { + return (*static_cast(offset1) == 
*static_cast(offset2)); } static bool AreEqualDoubles(const double a, const double b) { return (fabs(a - b) < 0.00001); } -static bool AreOptionsEqual(OptionType type, const char* this_offset, - const char* that_offset) { +static bool AreOptionsEqual(OptionType type, const void* this_offset, + const void* that_offset) { switch (type) { case OptionType::kBoolean: return IsOptionEqual(this_offset, that_offset); @@ -1137,29 +1230,31 @@ static bool AreOptionsEqual(OptionType type, const char* this_offset, return IsOptionEqual(this_offset, that_offset); case OptionType::kInt64T: { int64_t v1, v2; - GetUnaligned(reinterpret_cast(this_offset), &v1); - GetUnaligned(reinterpret_cast(that_offset), &v2); + GetUnaligned(static_cast(this_offset), &v1); + GetUnaligned(static_cast(that_offset), &v2); return (v1 == v2); } + case OptionType::kUInt8T: + return IsOptionEqual(this_offset, that_offset); case OptionType::kUInt32T: return IsOptionEqual(this_offset, that_offset); case OptionType::kUInt64T: { uint64_t v1, v2; - GetUnaligned(reinterpret_cast(this_offset), &v1); - GetUnaligned(reinterpret_cast(that_offset), &v2); + GetUnaligned(static_cast(this_offset), &v1); + GetUnaligned(static_cast(that_offset), &v2); return (v1 == v2); } case OptionType::kSizeT: { size_t v1, v2; - GetUnaligned(reinterpret_cast(this_offset), &v1); - GetUnaligned(reinterpret_cast(that_offset), &v2); + GetUnaligned(static_cast(this_offset), &v1); + GetUnaligned(static_cast(that_offset), &v2); return (v1 == v2); } case OptionType::kString: return IsOptionEqual(this_offset, that_offset); case OptionType::kDouble: - return AreEqualDoubles(*reinterpret_cast(this_offset), - *reinterpret_cast(that_offset)); + return AreEqualDoubles(*static_cast(this_offset), + *static_cast(that_offset)); case OptionType::kCompactionStyle: return IsOptionEqual(this_offset, that_offset); case OptionType::kCompactionStopStyle: @@ -1172,6 +1267,8 @@ static bool AreOptionsEqual(OptionType type, const char* this_offset, return IsOptionEqual(this_offset, that_offset); case OptionType::kEncodingType: return IsOptionEqual(this_offset, that_offset); + case OptionType::kEncodedString: + return IsOptionEqual(this_offset, that_offset); default: return false; } // End switch @@ -1186,8 +1283,8 @@ bool OptionTypeInfo::AreEqual(const ConfigOptions& config_options, if (!config_options.IsCheckEnabled(level)) { return true; // If the sanity level is not being checked, skip it } - const auto this_addr = reinterpret_cast(this_ptr) + offset_; - const auto that_addr = reinterpret_cast(that_ptr) + offset_; + const void* this_addr = static_cast(this_ptr) + offset_; + const void* that_addr = static_cast(that_ptr) + offset_; if (this_addr == nullptr || that_addr == nullptr) { if (this_addr == that_addr) { return true; @@ -1227,25 +1324,35 @@ bool OptionTypeInfo::AreEqual(const ConfigOptions& config_options, return false; } +bool OptionTypeInfo::TypesAreEqual( + const ConfigOptions& config_options, + const std::unordered_map& type_map, + const void* this_addr, const void* that_addr, std::string* mismatch) { + for (const auto& iter : type_map) { + const auto& opt_info = iter.second; + if (!opt_info.AreEqual(config_options, iter.first, this_addr, that_addr, + mismatch)) { + return false; + } + } + return true; +} + bool OptionTypeInfo::StructsAreEqual( const ConfigOptions& config_options, const std::string& struct_name, const std::unordered_map* struct_map, - const std::string& opt_name, const char* this_addr, const char* that_addr, + const std::string& opt_name, const void* 
this_addr, const void* that_addr, std::string* mismatch) { assert(struct_map); bool matches = true; std::string result; if (EndsWith(opt_name, struct_name)) { // This option represents the entire struct - for (const auto& iter : *struct_map) { - const auto& opt_info = iter.second; - - matches = opt_info.AreEqual(config_options, iter.first, this_addr, - that_addr, &result); - if (!matches) { - *mismatch = struct_name + "." + result; - return false; - } + matches = TypesAreEqual(config_options, *struct_map, this_addr, that_addr, + &result); + if (!matches) { + *mismatch = struct_name + "." + result; + return false; } } else if (StartsWith(opt_name, struct_name + ".")) { // This option represents a nested field in the struct (e.g, struct.field) diff --git a/options/options_helper.h b/options/options_helper.h index 4323d5f8e50..a16c265ede9 100644 --- a/options/options_helper.h +++ b/options/options_helper.h @@ -18,6 +18,7 @@ namespace ROCKSDB_NAMESPACE { struct ColumnFamilyOptions; struct ConfigOptions; struct DBOptions; +struct ImmutableCFOptions; struct ImmutableDBOptions; struct MutableDBOptions; struct MutableCFOptions; @@ -25,6 +26,8 @@ struct Options; std::vector GetSupportedCompressions(); +std::vector GetSupportedDictCompressions(); + // Checks that the combination of DBOptions and ColumnFamilyOptions are valid Status ValidateOptions(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts); @@ -36,6 +39,11 @@ ColumnFamilyOptions BuildColumnFamilyOptions( const ColumnFamilyOptions& ioptions, const MutableCFOptions& mutable_cf_options); +void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, + ColumnFamilyOptions* cf_opts); +void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, + ColumnFamilyOptions* cf_opts); + #ifndef ROCKSDB_LITE std::unique_ptr DBOptionsAsConfigurable( const MutableDBOptions& opts); @@ -46,23 +54,6 @@ std::unique_ptr CFOptionsAsConfigurable( const ColumnFamilyOptions& opts, const std::unordered_map* opt_map = nullptr); -Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, - const MutableCFOptions& mutable_opts, - std::string* opt_string); - -Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, - const MutableDBOptions& mutable_opts, - std::string* opt_string); - -Status GetMutableOptionsFromStrings( - const MutableCFOptions& base_options, - const std::unordered_map& options_map, - Logger* info_log, MutableCFOptions* new_options); - -Status GetMutableDBOptionsFromStrings( - const MutableDBOptions& base_options, - const std::unordered_map& options_map, - MutableDBOptions* new_options); bool ParseSliceTransform( const std::string& value, diff --git a/options/options_parser.cc b/options/options_parser.cc index e5f6106ab05..42cde218aad 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -13,7 +13,7 @@ #include #include -#include "file/read_write_util.h" +#include "file/line_file_reader.h" #include "file/writable_file_writer.h" #include "options/cf_options.h" #include "options/db_options.h" @@ -262,22 +262,17 @@ Status RocksDBOptionsParser::Parse(const ConfigOptions& config_options_in, if (!s.ok()) { return s; } - SequentialFileReader sf_reader(std::move(seq_file), file_name, - config_options.file_readahead_size); + LineFileReader lf_reader(std::move(seq_file), file_name, + config_options.file_readahead_size); OptionSection section = kOptionSectionUnknown; std::string title; std::string argument; std::unordered_map opt_map; - std::istringstream iss; std::string line; - bool has_data 
= true; // we only support single-lined statement. - for (int line_num = 1; ReadOneLine(&iss, &sf_reader, &line, &has_data, &s); - ++line_num) { - if (!s.ok()) { - return s; - } + while (lf_reader.ReadLine(&line)) { + int line_num = static_cast(lf_reader.GetLineNumber()); line = TrimAndRemoveComment(line); if (line.empty()) { continue; @@ -313,6 +308,10 @@ Status RocksDBOptionsParser::Parse(const ConfigOptions& config_options_in, opt_map.insert({name, value}); } } + s = lf_reader.GetStatus(); + if (!s.ok()) { + return s; + } s = EndSection(config_options, section, title, argument, opt_map); opt_map.clear(); diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 70bbb5123c3..e5cdce36dca 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -158,6 +158,9 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { *bbto, "cache_index_and_filter_blocks=1;" "cache_index_and_filter_blocks_with_high_priority=true;" + "metadata_cache_options={top_level_index_pinning=kFallback;" + "partition_pinning=kAll;" + "unpartitioned_pinning=kFlushedAndSimilar;};" "pin_l0_filter_and_index_blocks_in_cache=1;" "pin_top_level_index_and_filter=1;" "index_type=kHashSearch;" @@ -176,7 +179,9 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { "hash_index_allow_collision=false;" "verify_compression=true;read_amp_bytes_per_bit=0;" "enable_index_compression=false;" - "block_align=true", + "block_align=true;" + "max_auto_readahead_size=0;" + "prepopulate_block_cache=kDisable", new_bbto)); ASSERT_EQ(unset_bytes_base, @@ -223,6 +228,11 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { {offsetof(struct DBOptions, wal_filter), sizeof(const WalFilter*)}, {offsetof(struct DBOptions, file_checksum_gen_factory), sizeof(std::shared_ptr)}, + {offsetof(struct DBOptions, db_host_id), sizeof(std::string)}, + {offsetof(struct DBOptions, checksum_handoff_file_types), + sizeof(FileTypeSet)}, + {offsetof(struct DBOptions, compaction_service), + sizeof(std::shared_ptr)}, }; char* options_ptr = new char[sizeof(DBOptions)]; @@ -278,6 +288,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "skip_log_error_on_recovery=true;" "writable_file_max_buffer_size=1048576;" "paranoid_checks=true;" + "flush_verify_memtable_count=true;" + "track_and_verify_wals_in_manifest=true;" "is_fd_close_on_exec=false;" "bytes_per_sync=4295013613;" "strict_bytes_per_sync=true;" @@ -330,7 +342,9 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "write_dbid_to_manifest=false;" "best_efforts_recovery=false;" "max_bgerror_resume_count=2;" - "bgerror_resume_retry_interval=1000000", + "bgerror_resume_retry_interval=1000000" + "db_host_id=hostname;" + "allow_data_in_errors=false", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), @@ -400,14 +414,14 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { // Count padding bytes by setting all bytes in the memory to a special char, // copy a well constructed struct to this memory and see how many special // bytes left. - ColumnFamilyOptions* options = new (options_ptr) ColumnFamilyOptions(); FillWithSpecialChar(options_ptr, sizeof(ColumnFamilyOptions), kColumnFamilyOptionsExcluded); - // It based on the behavior of compiler that padding bytes are not changed - // when copying the struct. It's prone to failure when compiler behavior - // changes. We verify there is unset bytes to detect the case. 
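Editor's note: the comments above describe the sentinel-byte technique this settable-options test relies on, which is to fill the raw storage with a marker byte, construct the options object in place, and then count bytes that still hold the marker to find fields the string-based setters never touched. A minimal standalone illustration follows, under the assumption that the compiler leaves padding bytes alone (which, as the test's own comments note, is not guaranteed); the names and the sentinel value are illustrative.

```cpp
// Minimal illustration of the sentinel-byte technique; not part of the patch.
#include <cstring>
#include <iostream>
#include <new>

struct Probe {
  int a = 0;
  bool b = false;
  // Padding after `b` is left to the compiler.
  long c = 0;
};

int main() {
  constexpr unsigned char kSpecialChar = 0xFE;  // illustrative sentinel
  alignas(Probe) unsigned char buf[sizeof(Probe)];
  std::memset(buf, kSpecialChar, sizeof(buf));

  // Placement-new runs the constructor directly in the buffer instead of
  // copy-assigning a fully initialized object over it, so padding bytes
  // typically keep the sentinel (not guaranteed by the standard).
  Probe* p = new (buf) Probe();

  size_t untouched = 0;
  for (size_t i = 0; i < sizeof(Probe); ++i) {
    if (buf[i] == kSpecialChar) {
      ++untouched;
    }
  }
  std::cout << "bytes still holding the sentinel: " << untouched << "\n";
  p->~Probe();
  return 0;
}
```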
- *options = ColumnFamilyOptions(); + // Invoke a user-defined constructor in the hope that it does not overwrite + // padding bytes. Note that previously we relied on the implicitly-defined + // copy-assignment operator (i.e., `*options = ColumnFamilyOptions();`) here, + // which did in fact modify padding bytes. + ColumnFamilyOptions* options = new (options_ptr) ColumnFamilyOptions(); // Deprecatd option which is not initialized. Need to set it to avoid // Valgrind error @@ -433,6 +447,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { options->max_mem_compaction_level = 0; options->compaction_filter = nullptr; options->sst_partitioner_factory = nullptr; + options->bottommost_temperature = Temperature::kUnknown; char* new_options_ptr = new char[sizeof(ColumnFamilyOptions)]; ColumnFamilyOptions* new_options = @@ -465,8 +480,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "max_bytes_for_level_multiplier=60;" "memtable_factory=SkipListFactory;" "compression=kNoCompression;" - "compression_opts=5:6:7:8:9:true;" - "bottommost_compression_opts=4:5:6:7:8:true;" + "compression_opts=5:6:7:8:9:10:true:11;" + "bottommost_compression_opts=4:5:6:7:8:9:true:10;" "bottommost_compression=kDisableCompressionOption;" "level0_stop_writes_trigger=33;" "num_levels=99;" @@ -500,6 +515,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "min_blob_size=256;" "blob_file_size=1000000;" "blob_compression_type=kBZip2Compression;" + "enable_blob_garbage_collection=true;" + "blob_garbage_collection_age_cutoff=0.5;" "compaction_options_fifo={max_table_files_size=3;allow_" "compaction=false;};", new_options)); diff --git a/options/options_test.cc b/options/options_test.cc index 33925d7ed44..93f74c4c62e 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -102,6 +102,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"min_blob_size", "1K"}, {"blob_file_size", "1G"}, {"blob_compression_type", "kZSTD"}, + {"enable_blob_garbage_collection", "true"}, + {"blob_garbage_collection_age_cutoff", "0.5"}, }; std::unordered_map db_options_map = { @@ -109,6 +111,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"create_missing_column_families", "true"}, {"error_if_exists", "false"}, {"paranoid_checks", "true"}, + {"track_and_verify_wals_in_manifest", "true"}, {"max_open_files", "32"}, {"max_total_wal_size", "33"}, {"use_fsync", "true"}, @@ -230,6 +233,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10); ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30); ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD); + ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5); cf_options_map["write_buffer_size"] = "hello"; ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, @@ -263,6 +268,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.create_missing_column_families, true); ASSERT_EQ(new_db_opt.error_if_exists, false); ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true); ASSERT_EQ(new_db_opt.max_open_files, 32); ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast(33)); ASSERT_EQ(new_db_opt.use_fsync, true); @@ -719,12 +725,18 @@ TEST_F(OptionsTest, CompressionOptionsFromString) { ASSERT_OK(GetColumnFamilyOptionsFromString( ignore, ColumnFamilyOptions(), "compression_opts=5:6:7:8:9:x:false", &base_cf_opt)); - 
ASSERT_NOK(GetColumnFamilyOptionsFromString( + ASSERT_OK(GetColumnFamilyOptionsFromString( config_options, ColumnFamilyOptions(), "compression_opts=1:2:3:4:5:6:true:8", &base_cf_opt)); ASSERT_OK(GetColumnFamilyOptionsFromString( ignore, ColumnFamilyOptions(), "compression_opts=1:2:3:4:5:6:true:8", &base_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, ColumnFamilyOptions(), + "compression_opts=1:2:3:4:5:6:true:8:9", &base_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + ignore, ColumnFamilyOptions(), "compression_opts=1:2:3:4:5:6:true:8:9", + &base_cf_opt)); ASSERT_NOK(GetColumnFamilyOptionsFromString( config_options, ColumnFamilyOptions(), "compression_opts={unknown=bad;}", &base_cf_opt)); @@ -785,6 +797,7 @@ TEST_F(OptionsTest, OldInterfaceTest) { {"create_missing_column_families", "true"}, {"error_if_exists", "false"}, {"paranoid_checks", "true"}, + {"track_and_verify_wals_in_manifest", "true"}, {"max_open_files", "32"}, }; ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); @@ -792,6 +805,7 @@ TEST_F(OptionsTest, OldInterfaceTest) { ASSERT_EQ(new_db_opt.create_missing_column_families, true); ASSERT_EQ(new_db_opt.error_if_exists, false); ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true); ASSERT_EQ(new_db_opt.max_open_files, 32); db_options_map["unknown_option"] = "1"; Status s = GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt); @@ -859,10 +873,11 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(new_opt.format_version, 5U); ASSERT_EQ(new_opt.whole_key_filtering, true); ASSERT_TRUE(new_opt.filter_policy != nullptr); - const BloomFilterPolicy& bfp = - dynamic_cast(*new_opt.filter_policy); - EXPECT_EQ(bfp.GetMillibitsPerKey(), 4567); - EXPECT_EQ(bfp.GetWholeBitsPerKey(), 5); + const BloomFilterPolicy* bfp = + dynamic_cast(new_opt.filter_policy.get()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 4567); + EXPECT_EQ(bfp->GetWholeBitsPerKey(), 5); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kAutoBloom); // Verify that only the lower 32bits are stored in // new_opt.read_amp_bytes_per_bit. EXPECT_EQ(1U, new_opt.read_amp_bytes_per_bit); @@ -919,6 +934,23 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy); + // Ribbon filter policy + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, "filter_policy=ribbonfilter:5.678;", + &new_opt)); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + bfp = dynamic_cast(new_opt.filter_policy.get()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 5678); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kStandard128Ribbon); + // Old name + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, "filter_policy=experimental_ribbon:6.789;", + &new_opt)); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + bfp = dynamic_cast(new_opt.filter_policy.get()); + EXPECT_EQ(bfp->GetMillibitsPerKey(), 6789); + EXPECT_EQ(bfp->GetMode(), BloomFilterPolicy::kStandard128Ribbon); + // Check block cache options are overwritten when specified // in new format as a struct. ASSERT_OK(GetBlockBasedTableOptionsFromString( @@ -1274,6 +1306,77 @@ TEST_F(OptionsTest, OptionsComposeDecompose) { delete new_cf_opts.compaction_filter; } +TEST_F(OptionsTest, DBOptionsComposeImmutable) { + // Build a DBOptions from an Immutable/Mutable one and verify that + // we get same constituent options. 
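+  // (BuildDBOptions() recombines the ImmutableDBOptions/MutableDBOptions
+  // split back into a single DBOptions, and VerifyDBOptions() then compares
+  // the result field by field against the randomly initialized original.)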
+ ConfigOptions config_options; + Random rnd(301); + DBOptions base_opts, new_opts; + test::RandomInitDBOptions(&base_opts, &rnd); + MutableDBOptions m_opts(base_opts); + ImmutableDBOptions i_opts(base_opts); + new_opts = BuildDBOptions(i_opts, m_opts); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_opts, + new_opts)); +} + +TEST_F(OptionsTest, GetMutableDBOptions) { + Random rnd(228); + DBOptions base_opts; + std::string opts_str; + std::unordered_map opts_map; + ConfigOptions config_options; + + test::RandomInitDBOptions(&base_opts, &rnd); + ImmutableDBOptions i_opts(base_opts); + MutableDBOptions m_opts(base_opts); + MutableDBOptions new_opts; + ASSERT_OK(GetStringFromMutableDBOptions(config_options, m_opts, &opts_str)); + ASSERT_OK(StringToMap(opts_str, &opts_map)); + ASSERT_OK(GetMutableDBOptionsFromStrings(m_opts, opts_map, &new_opts)); + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions( + config_options, base_opts, BuildDBOptions(i_opts, new_opts))); +} + +TEST_F(OptionsTest, CFOptionsComposeImmutable) { + // Build a DBOptions from an Immutable/Mutable one and verify that + // we get same constituent options. + ConfigOptions config_options; + Random rnd(301); + ColumnFamilyOptions base_opts, new_opts; + DBOptions dummy; // Needed to create ImmutableCFOptions + test::RandomInitCFOptions(&base_opts, dummy, &rnd); + MutableCFOptions m_opts(base_opts); + ImmutableCFOptions i_opts(base_opts); + UpdateColumnFamilyOptions(i_opts, &new_opts); + UpdateColumnFamilyOptions(m_opts, &new_opts); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_opts, + new_opts)); + delete new_opts.compaction_filter; +} + +TEST_F(OptionsTest, GetMutableCFOptions) { + Random rnd(228); + ColumnFamilyOptions base, copy; + std::string opts_str; + std::unordered_map opts_map; + ConfigOptions config_options; + DBOptions dummy; // Needed to create ImmutableCFOptions + + test::RandomInitCFOptions(&base, dummy, &rnd); + ColumnFamilyOptions result; + MutableCFOptions m_opts(base), new_opts; + + ASSERT_OK(GetStringFromMutableCFOptions(config_options, m_opts, &opts_str)); + ASSERT_OK(StringToMap(opts_str, &opts_map)); + ASSERT_OK(GetMutableOptionsFromStrings(m_opts, opts_map, nullptr, &new_opts)); + UpdateColumnFamilyOptions(ImmutableCFOptions(base), ©); + UpdateColumnFamilyOptions(new_opts, ©); + + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base, copy)); + delete copy.compaction_filter; +} + TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) { Options options; ColumnFamilyOptions base_opt, new_opt; @@ -1353,6 +1456,7 @@ TEST_F(OptionsTest, MutableTableOptions) { ASSERT_EQ(bbto->block_size, 1024); ASSERT_OK(bbtf->PrepareOptions(config_options)); ASSERT_TRUE(bbtf->IsPrepared()); + config_options.mutable_options_only = true; ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "1024")); ASSERT_EQ(bbto->block_align, true); ASSERT_NOK(bbtf->ConfigureOption(config_options, "block_align", "false")); @@ -1372,6 +1476,79 @@ TEST_F(OptionsTest, MutableTableOptions) { ASSERT_EQ(bbto->block_size, 8192); } +TEST_F(OptionsTest, MutableCFOptions) { + ConfigOptions config_options; + ColumnFamilyOptions cf_opts; + + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, cf_opts, + "paranoid_file_checks=true; block_based_table_factory.block_align=false; " + "block_based_table_factory.block_size=8192;", + &cf_opts)); + ASSERT_TRUE(cf_opts.paranoid_file_checks); + ASSERT_NE(cf_opts.table_factory.get(), nullptr); + const auto bbto = 
cf_opts.table_factory->GetOptions(); + ASSERT_NE(bbto, nullptr); + ASSERT_EQ(bbto->block_size, 8192); + ASSERT_EQ(bbto->block_align, false); + std::unordered_map unused_opts; + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"paranoid_file_checks", "false"}}, &cf_opts)); + ASSERT_EQ(cf_opts.paranoid_file_checks, false); + + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.block_size", "16384"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 16384); + + config_options.mutable_options_only = true; + // Force consistency checks is not mutable + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"force_consistency_checks", "true"}}, + &cf_opts)); + + // Attempt to change the table. It is not mutable, so this should fail and + // leave the original intact + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"table_factory", "PlainTable"}}, &cf_opts)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"table_factory.id", "PlainTable"}}, &cf_opts)); + ASSERT_NE(cf_opts.table_factory.get(), nullptr); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + + // Change the block size. Should update the value in the current table + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.block_size", "8192"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 8192); + + // Attempt to turn off block cache fails, as this option is not mutable + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.no_block_cache", "true"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + + // Attempt to change the block size via a config string/map. Should update + // the current value + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory", "{block_size=32768}"}}, &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 32768); + + // Attempt to change the block size and no cache through the map. 
Should + // fail, leaving the old values intact + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory", + "{block_size=16384; no_block_cache=true}"}}, + &cf_opts)); + ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_EQ(bbto->block_size, 32768); +} + #endif // !ROCKSDB_LITE Status StringToMap( @@ -1564,6 +1741,94 @@ TEST_F(OptionsTest, GetStringFromCompressionType) { ASSERT_NOK( GetStringFromCompressionType(&res, static_cast(-10))); } + +TEST_F(OptionsTest, OnlyMutableDBOptions) { + std::string opt_str; + Random rnd(302); + ConfigOptions cfg_opts; + DBOptions db_opts; + DBOptions mdb_opts; + std::unordered_set m_names; + std::unordered_set a_names; + + test::RandomInitDBOptions(&db_opts, &rnd); + auto db_config = DBOptionsAsConfigurable(db_opts); + + // Get all of the DB Option names (mutable or not) + ASSERT_OK(db_config->GetOptionNames(cfg_opts, &a_names)); + + // Get only the mutable options from db_opts and set those in mdb_opts + cfg_opts.mutable_options_only = true; + + // Get only the Mutable DB Option names + ASSERT_OK(db_config->GetOptionNames(cfg_opts, &m_names)); + ASSERT_OK(GetStringFromDBOptions(cfg_opts, db_opts, &opt_str)); + ASSERT_OK(GetDBOptionsFromString(cfg_opts, mdb_opts, opt_str, &mdb_opts)); + std::string mismatch; + // Comparing only the mutable options, the two are equivalent + auto mdb_config = DBOptionsAsConfigurable(mdb_opts); + ASSERT_TRUE(mdb_config->AreEquivalent(cfg_opts, db_config.get(), &mismatch)); + ASSERT_TRUE(db_config->AreEquivalent(cfg_opts, mdb_config.get(), &mismatch)); + + ASSERT_GT(a_names.size(), m_names.size()); + for (const auto& n : m_names) { + std::string m, d; + ASSERT_OK(mdb_config->GetOption(cfg_opts, n, &m)); + ASSERT_OK(db_config->GetOption(cfg_opts, n, &d)); + ASSERT_EQ(m, d); + } + + cfg_opts.mutable_options_only = false; + // Comparing all of the options, the two are not equivalent + ASSERT_FALSE(mdb_config->AreEquivalent(cfg_opts, db_config.get(), &mismatch)); + ASSERT_FALSE(db_config->AreEquivalent(cfg_opts, mdb_config.get(), &mismatch)); +} + +TEST_F(OptionsTest, OnlyMutableCFOptions) { + std::string opt_str; + Random rnd(302); + ConfigOptions cfg_opts; + DBOptions db_opts; + ColumnFamilyOptions mcf_opts; + ColumnFamilyOptions cf_opts; + std::unordered_set m_names; + std::unordered_set a_names; + + test::RandomInitCFOptions(&cf_opts, db_opts, &rnd); + auto cf_config = CFOptionsAsConfigurable(cf_opts); + + // Get all of the CF Option names (mutable or not) + ASSERT_OK(cf_config->GetOptionNames(cfg_opts, &a_names)); + + // Get only the mutable options from cf_opts and set those in mcf_opts + cfg_opts.mutable_options_only = true; + // Get only the Mutable CF Option names + ASSERT_OK(cf_config->GetOptionNames(cfg_opts, &m_names)); + ASSERT_OK(GetStringFromColumnFamilyOptions(cfg_opts, cf_opts, &opt_str)); + ASSERT_OK( + GetColumnFamilyOptionsFromString(cfg_opts, mcf_opts, opt_str, &mcf_opts)); + std::string mismatch; + + auto mcf_config = CFOptionsAsConfigurable(mcf_opts); + // Comparing only the mutable options, the two are equivalent + ASSERT_TRUE(mcf_config->AreEquivalent(cfg_opts, cf_config.get(), &mismatch)); + ASSERT_TRUE(cf_config->AreEquivalent(cfg_opts, mcf_config.get(), &mismatch)); + + ASSERT_GT(a_names.size(), m_names.size()); + for (const auto& n : m_names) { + std::string m, d; + ASSERT_OK(mcf_config->GetOption(cfg_opts, n, &m)); + ASSERT_OK(cf_config->GetOption(cfg_opts, n, &d)); + ASSERT_EQ(m, d); + } + + cfg_opts.mutable_options_only = false; + // Comparing 
all of the options, the two are not equivalent + ASSERT_FALSE(mcf_config->AreEquivalent(cfg_opts, cf_config.get(), &mismatch)); + ASSERT_FALSE(cf_config->AreEquivalent(cfg_opts, mcf_config.get(), &mismatch)); + + delete cf_opts.compaction_filter; +} #endif // !ROCKSDB_LITE TEST_F(OptionsTest, ConvertOptionsTest) { @@ -1591,6 +1856,23 @@ TEST_F(OptionsTest, ConvertOptionsTest) { } #ifndef ROCKSDB_LITE +const static std::string kCustomEnvName = "Custom"; +const static std::string kCustomEnvProp = "env=" + kCustomEnvName; +class CustomEnv : public EnvWrapper { + public: + explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} +}; + +static int RegisterCustomEnv(ObjectLibrary& library, const std::string& arg) { + library.Register( + arg, [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, + std::string* /* errmsg */) { + static CustomEnv env(Env::Default()); + return &env; + }); + return 1; +} + // This test suite tests the old APIs into the Configure options methods. // Once those APIs are officially deprecated, this test suite can be deleted. class OptionsOldApiTest : public testing::Test {}; @@ -1653,6 +1935,8 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"min_blob_size", "1K"}, {"blob_file_size", "1G"}, {"blob_compression_type", "kZSTD"}, + {"enable_blob_garbage_collection", "true"}, + {"blob_garbage_collection_age_cutoff", "0.5"}, }; std::unordered_map db_options_map = { @@ -1660,6 +1944,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"create_missing_column_families", "true"}, {"error_if_exists", "false"}, {"paranoid_checks", "true"}, + {"track_and_verify_wals_in_manifest", "true"}, {"max_open_files", "32"}, {"max_total_wal_size", "33"}, {"use_fsync", "true"}, @@ -1773,6 +2058,8 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.min_blob_size, 1ULL << 10); ASSERT_EQ(new_cf_opt.blob_file_size, 1ULL << 30); ASSERT_EQ(new_cf_opt.blob_compression_type, kZSTD); + ASSERT_EQ(new_cf_opt.enable_blob_garbage_collection, true); + ASSERT_EQ(new_cf_opt.blob_garbage_collection_age_cutoff, 0.5); cf_options_map["write_buffer_size"] = "hello"; ASSERT_NOK(GetColumnFamilyOptionsFromMap( @@ -1808,6 +2095,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.create_missing_column_families, true); ASSERT_EQ(new_db_opt.error_if_exists, false); ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.track_and_verify_wals_in_manifest, true); ASSERT_EQ(new_db_opt.max_open_files, 32); ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast(33)); ASSERT_EQ(new_db_opt.use_fsync, true); @@ -2284,14 +2572,8 @@ TEST_F(OptionsOldApiTest, GetOptionsFromStringTest) { NewBlockBasedTableFactory(block_based_table_options)); // Register an Env with object registry. 
- const static char* kCustomEnvName = "CustomEnv"; - class CustomEnv : public EnvWrapper { - public: - explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} - }; - ObjectLibrary::Default()->Register( - kCustomEnvName, + "CustomEnvDefault", [](const std::string& /*name*/, std::unique_ptr* /*env_guard*/, std::string* /* errmsg */) { static CustomEnv env(Env::Default()); @@ -2305,7 +2587,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromStringTest) { "compression_opts=4:5:6;create_if_missing=true;max_open_files=1;" "bottommost_compression_opts=5:6:7;create_if_missing=true;max_open_files=" "1;" - "rate_limiter_bytes_per_sec=1024;env=CustomEnv", + "rate_limiter_bytes_per_sec=1024;env=CustomEnvDefault", &new_options)); ASSERT_EQ(new_options.compression_opts.window_bits, 4); @@ -2339,7 +2621,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromStringTest) { ASSERT_EQ(new_options.max_open_files, 1); ASSERT_TRUE(new_options.rate_limiter.get() != nullptr); Env* newEnv = new_options.env; - ASSERT_OK(Env::LoadEnv(kCustomEnvName, &newEnv)); + ASSERT_OK(Env::LoadEnv("CustomEnvDefault", &newEnv)); ASSERT_EQ(newEnv, new_options.env); } @@ -2390,14 +2672,10 @@ TEST_F(OptionsOldApiTest, ColumnFamilyOptionsSerialization) { #ifndef ROCKSDB_LITE class OptionsParserTest : public testing::Test { public: - OptionsParserTest() { - env_.reset(new test::StringEnv(Env::Default())); - fs_.reset(new LegacyFileSystemWrapper(env_.get())); - } + OptionsParserTest() { fs_.reset(new test::StringFS(FileSystem::Default())); } protected: - std::unique_ptr env_; - std::unique_ptr fs_; + std::shared_ptr fs_; }; TEST_F(OptionsParserTest, Comment) { @@ -2426,7 +2704,7 @@ TEST_F(OptionsParserTest, Comment) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_OK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2457,7 +2735,7 @@ TEST_F(OptionsParserTest, ExtraSpace) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_OK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2475,7 +2753,7 @@ TEST_F(OptionsParserTest, MissingDBOptions) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2505,7 +2783,7 @@ TEST_F(OptionsParserTest, DoubleDBOptions) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2533,7 +2811,7 @@ TEST_F(OptionsParserTest, NoDefaultCFOptions) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = 
"test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2563,7 +2841,7 @@ TEST_F(OptionsParserTest, DefaultCFOptionsMustBeTheFirst) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2592,7 +2870,7 @@ TEST_F(OptionsParserTest, DuplicateCFOptions) { "[CFOptions \"something_else\"]\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK( parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2660,12 +2938,12 @@ TEST_F(OptionsParserTest, IgnoreUnknownOptions) { " # if a section is blank, we will use the default\n"; const std::string kTestFileName = "test-rocksdb-options.ini"; - auto s = env_->FileExists(kTestFileName); + auto s = fs_->FileExists(kTestFileName, IOOptions(), nullptr); ASSERT_TRUE(s.ok() || s.IsNotFound()); if (s.ok()) { - ASSERT_OK(env_->DeleteFile(kTestFileName)); + ASSERT_OK(fs_->DeleteFile(kTestFileName, IOOptions(), nullptr)); } - ASSERT_OK(env_->WriteToNewFile(kTestFileName, options_file_content)); + ASSERT_OK(fs_->WriteToNewFile(kTestFileName, options_file_content)); RocksDBOptionsParser parser; ASSERT_NOK(parser.Parse(kTestFileName, fs_.get(), false, 4096 /* readahead_size */)); @@ -2713,7 +2991,7 @@ TEST_F(OptionsParserTest, ParseVersion) { snprintf(buffer, kLength - 1, file_template.c_str(), iv.c_str()); parser.Reset(); - ASSERT_OK(env_->WriteToNewFile(iv, buffer)); + ASSERT_OK(fs_->WriteToNewFile(iv, buffer)); ASSERT_NOK(parser.Parse(iv, fs_.get(), false, 0 /* readahead_size */)); } @@ -2722,7 +3000,7 @@ TEST_F(OptionsParserTest, ParseVersion) { for (auto vv : valid_versions) { snprintf(buffer, kLength - 1, file_template.c_str(), vv.c_str()); parser.Reset(); - ASSERT_OK(env_->WriteToNewFile(vv, buffer)); + ASSERT_OK(fs_->WriteToNewFile(vv, buffer)); ASSERT_OK(parser.Parse(vv, fs_.get(), false, 0 /* readahead_size */)); } } @@ -2831,37 +3109,37 @@ TEST_F(OptionsParserTest, Readahead) { kOptionsFileName, fs_.get())); uint64_t file_size = 0; - ASSERT_OK(env_->GetFileSize(kOptionsFileName, &file_size)); + ASSERT_OK( + fs_->GetFileSize(kOptionsFileName, IOOptions(), &file_size, nullptr)); assert(file_size > 0); RocksDBOptionsParser parser; - env_->num_seq_file_read_ = 0; + fs_->num_seq_file_read_ = 0; size_t readahead_size = 128 * 1024; ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, readahead_size)); - ASSERT_EQ(env_->num_seq_file_read_.load(), + ASSERT_EQ(fs_->num_seq_file_read_.load(), (file_size - 1) / readahead_size + 1); - env_->num_seq_file_read_.store(0); + fs_->num_seq_file_read_.store(0); readahead_size = 1024 * 1024; ASSERT_OK(parser.Parse(kOptionsFileName, fs_.get(), false, readahead_size)); - ASSERT_EQ(env_->num_seq_file_read_.load(), + ASSERT_EQ(fs_->num_seq_file_read_.load(), (file_size - 1) / readahead_size + 1); // Tiny readahead. 8 KB is read each time. 
- env_->num_seq_file_read_.store(0); + fs_->num_seq_file_read_.store(0); ASSERT_OK( parser.Parse(kOptionsFileName, fs_.get(), false, 1 /* readahead_size */)); - ASSERT_GE(env_->num_seq_file_read_.load(), file_size / (8 * 1024)); - ASSERT_LT(env_->num_seq_file_read_.load(), file_size / (8 * 1024) * 2); + ASSERT_GE(fs_->num_seq_file_read_.load(), file_size / (8 * 1024)); + ASSERT_LT(fs_->num_seq_file_read_.load(), file_size / (8 * 1024) * 2); // Disable readahead means 512KB readahead. - env_->num_seq_file_read_.store(0); + fs_->num_seq_file_read_.store(0); ASSERT_OK( parser.Parse(kOptionsFileName, fs_.get(), false, 0 /* readahead_size */)); - ASSERT_GE(env_->num_seq_file_read_.load(), - (file_size - 1) / (512 * 1024) + 1); + ASSERT_GE(fs_->num_seq_file_read_.load(), (file_size - 1) / (512 * 1024) + 1); } TEST_F(OptionsParserTest, DumpAndParse) { @@ -3059,7 +3337,7 @@ class OptionsSanityCheckTest : public OptionsParserTest { } Status PersistCFOptions(const ColumnFamilyOptions& cf_opts) { - Status s = env_->DeleteFile(kOptionsFileName); + Status s = fs_->DeleteFile(kOptionsFileName, IOOptions(), nullptr); if (!s.ok()) { return s; } @@ -3437,8 +3715,8 @@ TEST_F(OptionTypeInfoTest, TestInvalidArgs) { OptionVerificationType::kNormal, OptionTypeFlags::kNone, [](const ConfigOptions&, const std::string&, - const std::string& value, char* addr) { - auto ptr = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto ptr = static_cast(addr); *ptr = ParseInt(value); return Status::OK(); }); @@ -3451,8 +3729,8 @@ TEST_F(OptionTypeInfoTest, TestParseFunc) { 0, OptionType::kUnknown, OptionVerificationType::kNormal, OptionTypeFlags::kNone, [](const ConfigOptions& /*opts*/, const std::string& name, - const std::string& value, char* addr) { - auto ptr = reinterpret_cast(addr); + const std::string& value, void* addr) { + auto ptr = static_cast(addr); if (name == "Oops") { return Status::InvalidArgument(value); } else { @@ -3472,7 +3750,7 @@ TEST_F(OptionTypeInfoTest, TestSerializeFunc) { 0, OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kNone, nullptr, [](const ConfigOptions& /*opts*/, const std::string& name, - const char* /*addr*/, std::string* value) { + const void* /*addr*/, std::string* value) { if (name == "Oops") { return Status::InvalidArgument(name); } else { @@ -3494,9 +3772,9 @@ TEST_F(OptionTypeInfoTest, TestEqualsFunc) { 0, OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kNone, nullptr, nullptr, [](const ConfigOptions& /*opts*/, const std::string& name, - const char* addr1, const char* addr2, std::string* mismatch) { - auto i1 = *(reinterpret_cast(addr1)); - auto i2 = *(reinterpret_cast(addr2)); + const void* addr1, const void* addr2, std::string* mismatch) { + auto i1 = *(static_cast(addr1)); + auto i2 = *(static_cast(addr2)); if (name == "LT") { return i1 < i2; } else if (name == "GT") { @@ -3550,8 +3828,7 @@ TEST_F(OptionTypeInfoTest, TestOptionFlags) { // An alias can change the value via parse, but does nothing on serialize on // match std::string result; - ASSERT_OK(opt_alias.Parse(config_options, "Alias", "Alias", - reinterpret_cast(&base))); + ASSERT_OK(opt_alias.Parse(config_options, "Alias", "Alias", &base)); ASSERT_OK(opt_alias.Serialize(config_options, "Alias", &base, &result)); ASSERT_TRUE( opt_alias.AreEqual(config_options, "Alias", &base, &comp, &result)); @@ -3762,6 +4039,73 @@ TEST_F(OptionTypeInfoTest, TestVectorType) { ASSERT_EQ(vec1[1], "b1|b2"); ASSERT_EQ(vec1[2], "c1|c2|{d1|d2}"); } + +TEST_F(OptionTypeInfoTest, 
TestStaticType) { + struct SimpleOptions { + size_t size = 0; + bool verify = true; + }; + + static std::unordered_map type_map = { + {"size", {offsetof(struct SimpleOptions, size), OptionType::kSizeT}}, + {"verify", + {offsetof(struct SimpleOptions, verify), OptionType::kBoolean}}, + }; + + ConfigOptions config_options; + SimpleOptions opts, copy; + opts.size = 12345; + opts.verify = false; + std::string str, mismatch; + + ASSERT_OK( + OptionTypeInfo::SerializeType(config_options, type_map, &opts, &str)); + ASSERT_FALSE(OptionTypeInfo::TypesAreEqual(config_options, type_map, &opts, + ©, &mismatch)); + ASSERT_OK(OptionTypeInfo::ParseType(config_options, str, type_map, ©)); + ASSERT_TRUE(OptionTypeInfo::TypesAreEqual(config_options, type_map, &opts, + ©, &mismatch)); +} + +class ConfigOptionsTest : public testing::Test {}; + +TEST_F(ConfigOptionsTest, EnvFromConfigOptions) { + ConfigOptions config_options; + DBOptions db_opts; + Options opts; + Env* mem_env = NewMemEnv(Env::Default()); + config_options.registry->AddLibrary("custom-env", RegisterCustomEnv, + kCustomEnvName); + + config_options.env = mem_env; + // First test that we can get the env as expected + ASSERT_OK(GetDBOptionsFromString(config_options, DBOptions(), kCustomEnvProp, + &db_opts)); + ASSERT_OK( + GetOptionsFromString(config_options, Options(), kCustomEnvProp, &opts)); + ASSERT_NE(config_options.env, db_opts.env); + ASSERT_EQ(opts.env, db_opts.env); + Env* custom_env = db_opts.env; + + // Now try a "bad" env" and check that nothing changed + config_options.ignore_unsupported_options = true; + ASSERT_OK( + GetDBOptionsFromString(config_options, db_opts, "env=unknown", &db_opts)); + ASSERT_OK(GetOptionsFromString(config_options, opts, "env=unknown", &opts)); + ASSERT_EQ(config_options.env, mem_env); + ASSERT_EQ(db_opts.env, custom_env); + ASSERT_EQ(opts.env, db_opts.env); + + // Now try a "bad" env" ignoring unknown objects + config_options.ignore_unsupported_options = false; + ASSERT_NOK( + GetDBOptionsFromString(config_options, db_opts, "env=unknown", &db_opts)); + ASSERT_EQ(config_options.env, mem_env); + ASSERT_EQ(db_opts.env, custom_env); + ASSERT_EQ(opts.env, db_opts.env); + + delete mem_env; +} #endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/plugin/README.md b/plugin/README.md new file mode 100644 index 00000000000..4bae8312f8e --- /dev/null +++ b/plugin/README.md @@ -0,0 +1,25 @@ +## Building external plugins together with RocksDB + +RocksDB offers several plugin interfaces for developers to customize its behavior. One difficulty developers face is how to make their plugin available to end users. The approach discussed here involves building the external code together with the RocksDB code into a single binary. Note another approach we plan to support involves loading plugins dynamically from shared libraries. + +### Discovery + +We hope developers will mention their work in "PLUGINS.md" so users can easily discover and reuse solutions for customizing RocksDB. + +### Directory organization + +External plugins will be linked according to their name into a subdirectory of "plugin/". For example, a plugin called "dedupfs" would be linked into "plugin/dedupfs/". + +### Build standard + +Currently the only supported build system is make. In the plugin directory, files ending in the .mk extension can define the following variables. + +* `$(PLUGIN_NAME)_SOURCES`: these files will be compiled and linked with RocksDB. They can access RocksDB public header files. 
+* `$(PLUGIN_NAME)_HEADERS`: these files will be installed in the RocksDB header directory. Their paths will be prefixed by "rocksdb/plugin/$(PLUGIN_NAME)/". +* `$(PLUGIN_NAME)_LDFLAGS`: these flags will be passed to the final link step. For example, library dependencies can be propagated here, or symbols can be forcibly included, e.g., for static registration. + +Users will run the usual make commands from the RocksDB directory, specifying the plugins to include in a space-separated list in the variable `ROCKSDB_PLUGINS`. + +### Example + +For a working example, see [Dedupfs](https://github.com/ajkr/dedupfs). diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h index fbf25c313d0..c0ef19a40bd 100644 --- a/port/jemalloc_helper.h +++ b/port/jemalloc_helper.h @@ -38,25 +38,54 @@ static inline bool HasJemalloc() { return true; } #else +// definitions for compatibility with older versions of jemalloc +#if !defined(JEMALLOC_ALLOCATOR) +#define JEMALLOC_ALLOCATOR +#endif +#if !defined(JEMALLOC_RESTRICT_RETURN) +#define JEMALLOC_RESTRICT_RETURN +#endif +#if !defined(JEMALLOC_NOTHROW) +#define JEMALLOC_NOTHROW JEMALLOC_ATTR(nothrow) +#endif +#if !defined(JEMALLOC_ALLOC_SIZE) +#ifdef JEMALLOC_HAVE_ATTR_ALLOC_SIZE +#define JEMALLOC_ALLOC_SIZE(s) JEMALLOC_ATTR(alloc_size(s)) +#else +#define JEMALLOC_ALLOC_SIZE(s) +#endif +#endif + // Declare non-standard jemalloc APIs as weak symbols. We can null-check these // symbols to detect whether jemalloc is linked with the binary. -extern "C" void* mallocx(size_t, int) __attribute__((__nothrow__, __weak__)); -extern "C" void* rallocx(void*, size_t, int) __attribute__((__nothrow__, __weak__)); -extern "C" size_t xallocx(void*, size_t, size_t, int) __attribute__((__nothrow__, __weak__)); -extern "C" size_t sallocx(const void*, int) __attribute__((__nothrow__, __weak__)); -extern "C" void dallocx(void*, int) __attribute__((__nothrow__, __weak__)); -extern "C" void sdallocx(void*, size_t, int) __attribute__((__nothrow__, __weak__)); -extern "C" size_t nallocx(size_t, int) __attribute__((__nothrow__, __weak__)); -extern "C" int mallctl(const char*, void*, size_t*, void*, size_t) - __attribute__((__nothrow__, __weak__)); -extern "C" int mallctlnametomib(const char*, size_t*, size_t*) - __attribute__((__nothrow__, __weak__)); -extern "C" int mallctlbymib(const size_t*, size_t, void*, size_t*, void*, - size_t) __attribute__((__nothrow__, __weak__)); -extern "C" void malloc_stats_print(void (*)(void*, const char*), void*, - const char*) __attribute__((__nothrow__, __weak__)); -extern "C" size_t malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void*) - JEMALLOC_CXX_THROW __attribute__((__weak__)); +extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * +mallocx(size_t, int) JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1) + __attribute__((__weak__)); +extern "C" JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN void JEMALLOC_NOTHROW * +rallocx(void *, size_t, int) JEMALLOC_ALLOC_SIZE(2) __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW xallocx(void *, size_t, size_t, int) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW sallocx(const void *, int) + JEMALLOC_ATTR(pure) __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW dallocx(void *, int) __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW sdallocx(void *, size_t, int) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW nallocx(size_t, int) JEMALLOC_ATTR(pure) + __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctl(const char 
*, void *, size_t *, void *, + size_t) __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctlnametomib(const char *, size_t *, + size_t *) + __attribute__((__weak__)); +extern "C" int JEMALLOC_NOTHROW mallctlbymib(const size_t *, size_t, void *, + size_t *, void *, size_t) + __attribute__((__weak__)); +extern "C" void JEMALLOC_NOTHROW +malloc_stats_print(void (*)(void *, const char *), void *, const char *) + __attribute__((__weak__)); +extern "C" size_t JEMALLOC_NOTHROW +malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *) JEMALLOC_CXX_THROW + __attribute__((__weak__)); // Check if Jemalloc is linked with the binary. Note the main program might be // using a different memory allocator even this method return true. diff --git a/port/lang.h b/port/lang.h index a5fd364907c..4429f105eed 100644 --- a/port/lang.h +++ b/port/lang.h @@ -14,3 +14,28 @@ #define FALLTHROUGH_INTENDED do {} while (0) #endif #endif + +#if defined(__clang__) +#if defined(__has_feature) +#if __has_feature(address_sanitizer) +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // __has_feature(address_sanitizer) +#endif // defined(__has_feature) +#else // __clang__ +#ifdef __SANITIZE_ADDRESS__ +#define MUST_FREE_HEAP_ALLOCATIONS 1 +#endif // __SANITIZE_ADDRESS__ +#endif // __clang__ + +// Coding guidelines say to avoid static objects with non-trivial destructors, +// because it's easy to cause trouble (UB) in static destruction. This +// macro makes it easier to define static objects that are normally never +// destructed, except are destructed when running under ASAN. This should +// avoid unexpected, unnecessary destruction behavior in production. +// Note that constructor arguments can be provided as in +// STATIC_AVOID_DESTRUCTION(Foo, foo)(arg1, arg2); +#ifdef MUST_FREE_HEAP_ALLOCATIONS +#define STATIC_AVOID_DESTRUCTION(Type, name) static Type name +#else +#define STATIC_AVOID_DESTRUCTION(Type, name) static Type& name = *new Type +#endif diff --git a/port/port_posix.cc b/port/port_posix.cc index 8610c6e0d5d..112984de296 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -21,11 +21,12 @@ #include #include #include -#include #include #include + #include -#include "logging/logging.h" + +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -47,7 +48,7 @@ namespace port { static int PthreadCall(const char* label, int result) { if (result != 0 && result != ETIMEDOUT) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); + fprintf(stderr, "pthread %s: %s\n", label, errnoStr(result).c_str()); abort(); } return result; diff --git a/port/port_posix.h b/port/port_posix.h index a24c7b690e2..90f131e1310 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -167,7 +167,7 @@ static inline void AsmVolatilePause() { #if defined(__i386__) || defined(__x86_64__) asm volatile("pause"); #elif defined(__aarch64__) - asm volatile("wfe"); + asm volatile("yield"); #elif defined(__powerpc64__) asm volatile("or 27,27,27"); #endif diff --git a/port/stack_trace.cc b/port/stack_trace.cc index dee8bd9cea6..c82da2a206b 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -7,7 +7,7 @@ #if defined(ROCKSDB_LITE) || \ !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || defined(CYGWIN) || \ - defined(OS_FREEBSD) || defined(OS_SOLARIS) || defined(OS_WIN) + defined(OS_SOLARIS) || defined(OS_WIN) // noop @@ -32,6 +32,10 @@ void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) { #include #include +#if defined(OS_FREEBSD) +#include +#endif + namespace ROCKSDB_NAMESPACE { namespace port { @@ 
-41,6 +45,7 @@ namespace { const char* GetExecutableName() { static char name[1024]; +#if !defined(OS_FREEBSD) char link[1024]; snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); auto read = readlink(link, name, sizeof(name) - 1); @@ -50,6 +55,17 @@ const char* GetExecutableName() { name[read] = 0; return name; } +#else + int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1}; + size_t namesz = sizeof(name); + + auto ret = sysctl(mib, 4, name, &namesz, nullptr, 0); + if (-1 == ret) { + return nullptr; + } else { + return name; + } +#endif } void PrintStackTraceLine(const char* symbol, void* frame) { @@ -144,6 +160,22 @@ static void StackTraceHandler(int sig) { fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig)); // skip the top three signal handler related frames PrintStack(3); + + // Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of + // a signal" have failed, so just warn the user about them. +#if defined(__clang__) && defined(__has_feature) +#if __has_feature(thread_sanitizer) + fprintf(stderr, + "==> NOTE: any above warnings about \"signal-unsafe call\" are\n" + "==> ignorable, as they are expected when generating a stack\n" + "==> trace because of a signal under TSAN. Consider why the\n" + "==> signal was generated to begin with, and the stack trace\n" + "==> in the TSAN warning can be useful for that. (The stack\n" + "==> trace printed by the signal handler is likely obscured\n" + "==> by TSAN output.)\n"); +#endif +#endif + // re-signal to default handler (so we still get core dump if needed...) raise(sig); } diff --git a/port/win/env_default.cc b/port/win/env_default.cc index 36f95fbe342..0c9958ddd9c 100644 --- a/port/win/env_default.cc +++ b/port/win/env_default.cc @@ -11,8 +11,8 @@ #include -#include #include "port/win/env_win.h" +#include "rocksdb/env.h" #include "test_util/sync_point.h" #include "util/compression_context_cache.h" #include "util/thread_local.h" diff --git a/port/win/env_win.cc b/port/win/env_win.cc index f7d8f9ce3ce..cc337c1f8c6 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -9,37 +9,37 @@ #if defined(OS_WIN) -#include "port/win/env_win.h" -#include "port/win/win_thread.h" -#include -#include -#include - +#include // _rmdir, _mkdir, _getcwd #include -#include // _access -#include // _rmdir, _mkdir, _getcwd -#include +#include // _access +#include // for uuid generation +#include #include +#include +#include -#include "rocksdb/env.h" -#include "rocksdb/slice.h" - -#include "port/port.h" -#include "port/port_dirent.h" -#include "port/win/win_logger.h" -#include "port/win/io_win.h" +#include +#include +#include #include "monitoring/iostats_context_imp.h" - #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" - -#include // for uuid generation -#include -#include +#include "port/port.h" +#include "port/port_dirent.h" +#include "port/win/env_win.h" +#include "port/win/io_win.h" +#include "port/win/win_logger.h" +#include "port/win/win_thread.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" #include "strsafe.h" +#include "util/string_util.h" -#include +// Undefine the functions windows might use (again)... 
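+// (windows.h defines these names as macros, e.g. DeleteFile expands to
+// DeleteFileA or DeleteFileW, which would otherwise clash with the
+// identically named methods used in this file.)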
+#undef GetCurrentTime +#undef DeleteFile +#undef LoadLibrary namespace ROCKSDB_NAMESPACE { @@ -61,29 +61,19 @@ typedef std::unique_ptr UniqueFindClosePtr; void WinthreadCall(const char* label, std::error_code result) { if (0 != result.value()) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result.value())); + fprintf(stderr, "Winthread %s: %s\n", label, + errnoStr(result.value()).c_str()); abort(); } } -} +} // namespace namespace port { - -WinEnvIO::WinEnvIO(Env* hosted_env) - : hosted_env_(hosted_env), - page_size_(4 * 1024), - allocation_granularity_(page_size_), - perf_counter_frequency_(0), +WinClock::WinClock() + : perf_counter_frequency_(0), nano_seconds_per_period_(0), GetSystemTimePreciseAsFileTime_(NULL) { - - SYSTEM_INFO sinfo; - GetSystemInfo(&sinfo); - - page_size_ = sinfo.dwPageSize; - allocation_granularity_ = sinfo.dwAllocationGranularity; - { LARGE_INTEGER qpf; BOOL ret __attribute__((__unused__)); @@ -98,39 +88,91 @@ WinEnvIO::WinEnvIO(Env* hosted_env) HMODULE module = GetModuleHandle("kernel32.dll"); if (module != NULL) { - GetSystemTimePreciseAsFileTime_ = - (FnGetSystemTimePreciseAsFileTime)GetProcAddress( - module, "GetSystemTimePreciseAsFileTime"); + GetSystemTimePreciseAsFileTime_ = (FnGetSystemTimePreciseAsFileTime)( + void*)GetProcAddress(module, "GetSystemTimePreciseAsFileTime"); } } -WinEnvIO::~WinEnvIO() { +void WinClock::SleepForMicroseconds(int micros) { + std::this_thread::sleep_for(std::chrono::microseconds(micros)); } -Status WinEnvIO::DeleteFile(const std::string& fname) { - Status result; +std::string WinClock::TimeToString(uint64_t secondsSince1970) { + std::string result; - BOOL ret = RX_DeleteFile(RX_FN(fname).c_str()); + const time_t seconds = secondsSince1970; + const int maxsize = 64; - if(!ret) { - auto lastError = GetLastError(); - result = IOErrorFromWindowsError("Failed to delete: " + fname, - lastError); + struct tm t; + errno_t ret = localtime_s(&t, &seconds); + + if (ret) { + result = std::to_string(seconds); + } else { + result.resize(maxsize); + char* p = &result[0]; + + int len = + snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); + assert(len > 0); + + result.resize(len); } return result; } -Status WinEnvIO::Truncate(const std::string& fname, size_t size) { - Status s; - int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size); - if (result != 0) { - s = IOError("Failed to truncate: " + fname, errno); +uint64_t WinClock::NowMicros() { + if (GetSystemTimePreciseAsFileTime_ != NULL) { + // all std::chrono clocks on windows proved to return + // values that may repeat that is not good enough for some uses. 
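+    // FILETIME counts 100-nanosecond ticks since 1601-01-01, so converting to
+    // Unix time means subtracting the ticks accumulated before 1970-01-01 and
+    // then dividing by 10 ticks per microsecond.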
+ const int64_t c_UnixEpochStartTicks = 116444736000000000LL; + const int64_t c_FtToMicroSec = 10; + + // This interface needs to return system time and not + // just any microseconds because it is often used as an argument + // to TimedWait() on condition variable + FILETIME ftSystemTime; + GetSystemTimePreciseAsFileTime_(&ftSystemTime); + + LARGE_INTEGER li; + li.LowPart = ftSystemTime.dwLowDateTime; + li.HighPart = ftSystemTime.dwHighDateTime; + // Subtract unix epoch start + li.QuadPart -= c_UnixEpochStartTicks; + // Convert to microsecs + li.QuadPart /= c_FtToMicroSec; + return li.QuadPart; } - return s; + using namespace std::chrono; + return duration_cast(system_clock::now().time_since_epoch()) + .count(); +} + +uint64_t WinClock::NowNanos() { + if (nano_seconds_per_period_ != 0) { + // all std::chrono clocks on windows have the same resolution that is only + // good enough for microseconds but not nanoseconds + // On Windows 8 and Windows 2012 Server + // GetSystemTimePreciseAsFileTime(¤t_time) can be used + LARGE_INTEGER li; + QueryPerformanceCounter(&li); + // Convert performance counter to nanoseconds by precomputed ratio. + // Directly multiply nano::den with li.QuadPart causes overflow. + // Only do this when nano::den is divisible by perf_counter_frequency_, + // which most likely is the case in reality. If it's not, fall back to + // high_resolution_clock, which may be less precise under old compilers. + li.QuadPart *= nano_seconds_per_period_; + return li.QuadPart; + } + using namespace std::chrono; + return duration_cast( + high_resolution_clock::now().time_since_epoch()) + .count(); } -Status WinEnvIO::GetCurrentTime(int64_t* unix_time) { +Status WinClock::GetCurrentTime(int64_t* unix_time) { time_t time = std::time(nullptr); if (time == (time_t)(-1)) { return Status::NotSupported("Failed to get time"); @@ -140,10 +182,55 @@ Status WinEnvIO::GetCurrentTime(int64_t* unix_time) { return Status::OK(); } -Status WinEnvIO::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - Status s; +WinFileSystem::WinFileSystem(const std::shared_ptr& clock) + : clock_(clock), page_size_(4 * 1024), allocation_granularity_(page_size_) { + SYSTEM_INFO sinfo; + GetSystemInfo(&sinfo); + + page_size_ = sinfo.dwPageSize; + allocation_granularity_ = sinfo.dwAllocationGranularity; +} + +const std::shared_ptr& WinFileSystem::Default() { + static std::shared_ptr fs = + std::make_shared(WinClock::Default()); + return fs; +} + +WinEnvIO::WinEnvIO(Env* hosted_env) : hosted_env_(hosted_env) {} + +WinEnvIO::~WinEnvIO() {} + +IOStatus WinFileSystem::DeleteFile(const std::string& fname, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus result; + + BOOL ret = RX_DeleteFile(RX_FN(fname).c_str()); + + if (!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to delete: " + fname, lastError); + } + + return result; +} + +IOStatus WinFileSystem::Truncate(const std::string& fname, size_t size, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; + int result = ROCKSDB_NAMESPACE::port::Truncate(fname, size); + if (result != 0) { + s = IOError("Failed to truncate: " + fname, errno); + } + return s; +} + +IOStatus WinFileSystem::NewSequentialFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + IOStatus s; result->reset(); @@ -177,11 +264,11 @@ Status WinEnvIO::NewSequentialFile(const std::string& fname, return s; } -Status 
WinEnvIO::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { +IOStatus WinFileSystem::NewRandomAccessFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* dbg) { result->reset(); - Status s; + IOStatus s; // Open the file for read-only random access // Random access is to disable read-ahead as the system reads too much data @@ -198,10 +285,10 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = RX_CreateFile( - RX_FN(fname).c_str(), GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, OPEN_EXISTING, fileFlags, NULL); + hFile = + RX_CreateFile(RX_FN(fname).c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, OPEN_EXISTING, fileFlags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { @@ -217,13 +304,13 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, // Use mmap when virtual address-space is plentiful. uint64_t fileSize; - s = GetFileSize(fname, &fileSize); + s = GetFileSize(fname, IOOptions(), &fileSize, dbg); if (s.ok()) { // Will not map empty files if (fileSize == 0) { - return IOError( - "NewRandomAccessFile failed to map empty file: " + fname, EINVAL); + return IOError("NewRandomAccessFile failed to map empty file: " + fname, + EINVAL); } HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READONLY, @@ -241,11 +328,11 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc); const void* mapped_region = - MapViewOfFileEx(hMap, FILE_MAP_READ, - 0, // High DWORD of access start - 0, // Low DWORD - static_cast(fileSize), - NULL); // Let the OS choose the mapping + MapViewOfFileEx(hMap, FILE_MAP_READ, + 0, // High DWORD of access start + 0, // Low DWORD + static_cast(fileSize), + NULL); // Let the OS choose the mapping if (!mapped_region) { auto lastError = GetLastError(); @@ -261,26 +348,22 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, fileGuard.release(); } } else { - result->reset(new WinRandomAccessFile(fname, hFile, - std::max(GetSectorSize(fname), - page_size_), - options)); + result->reset(new WinRandomAccessFile( + fname, hFile, std::max(GetSectorSize(fname), page_size_), options)); fileGuard.release(); } return s; } -Status WinEnvIO::OpenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options, - bool reopen) { - +IOStatus WinFileSystem::OpenWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, bool reopen) { const size_t c_BufferCapacity = 64 * 1024; EnvOptions local_options(options); result->reset(); - Status s; + IOStatus s; DWORD fileFlags = FILE_ATTRIBUTE_NORMAL; @@ -317,11 +400,11 @@ Status WinEnvIO::OpenWritableFile(const std::string& fname, RX_FN(fname).c_str(), desired_access, // Access desired shared_mode, - NULL, // Security attributes + NULL, // Security attributes // Posix env says (reopen) ? 
(O_CREATE | O_APPEND) : O_CREAT | O_TRUNC creation_disposition, - fileFlags, // Flags - NULL); // Template File + fileFlags, // Flags + NULL); // Template File } if (INVALID_HANDLE_VALUE == hFile) { @@ -351,25 +434,37 @@ Status WinEnvIO::OpenWritableFile(const std::string& fname, } else { // Here we want the buffer allocation to be aligned by the SSD page size // and to be a multiple of it - result->reset(new WinWritableFile(fname, hFile, - std::max(GetSectorSize(fname), - GetPageSize()), - c_BufferCapacity, local_options)); + result->reset(new WinWritableFile( + fname, hFile, std::max(GetSectorSize(fname), GetPageSize()), + c_BufferCapacity, local_options)); } return s; } -Status WinEnvIO::NewRandomRWFile(const std::string & fname, - std::unique_ptr* result, - const EnvOptions & options) { +IOStatus WinFileSystem::NewWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + return OpenWritableFile(fname, options, result, false); +} - Status s; +IOStatus WinFileSystem::ReopenWritableFile( + const std::string& fname, const FileOptions& options, + std::unique_ptr* result, IODebugContext* /*dbg*/) { + return OpenWritableFile(fname, options, result, true); +} + +IOStatus WinFileSystem::NewRandomRWFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + IOStatus s; // Open the file for read-only random access // Random access is to disable read-ahead as the system reads too much data DWORD desired_access = GENERIC_READ | GENERIC_WRITE; DWORD shared_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - DWORD creation_disposition = OPEN_EXISTING; // Fail if file does not exist + DWORD creation_disposition = OPEN_EXISTING; // Fail if file does not exist DWORD file_flags = FILE_FLAG_RANDOM_ACCESS; if (options.use_direct_reads && options.use_direct_writes) { @@ -381,36 +476,28 @@ Status WinEnvIO::NewRandomRWFile(const std::string & fname, HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = - RX_CreateFile(RX_FN(fname).c_str(), - desired_access, - shared_mode, - NULL, // Security attributes - creation_disposition, - file_flags, - NULL); + hFile = RX_CreateFile(RX_FN(fname).c_str(), desired_access, shared_mode, + NULL, // Security attributes + creation_disposition, file_flags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); return IOErrorFromWindowsError( - "NewRandomRWFile failed to Create/Open: " + fname, lastError); + "NewRandomRWFile failed to Create/Open: " + fname, lastError); } UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); - result->reset(new WinRandomRWFile(fname, hFile, - std::max(GetSectorSize(fname), - GetPageSize()), - options)); + result->reset(new WinRandomRWFile( + fname, hFile, std::max(GetSectorSize(fname), GetPageSize()), options)); fileGuard.release(); return s; } -Status WinEnvIO::NewMemoryMappedFileBuffer( - const std::string & fname, - std::unique_ptr* result) { - Status s; +IOStatus WinFileSystem::NewMemoryMappedFileBuffer( + const std::string& fname, std::unique_ptr* result) { + IOStatus s; result->reset(); DWORD fileFlags = FILE_ATTRIBUTE_READONLY; @@ -420,11 +507,9 @@ Status WinEnvIO::NewMemoryMappedFileBuffer( IOSTATS_TIMER_GUARD(open_nanos); hFile = RX_CreateFile( RX_FN(fname).c_str(), GENERIC_READ | GENERIC_WRITE, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, OPEN_EXISTING, // Open only if it exists 
- fileFlags, - NULL); + fileFlags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { @@ -436,21 +521,21 @@ Status WinEnvIO::NewMemoryMappedFileBuffer( UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); uint64_t fileSize = 0; - s = GetFileSize(fname, &fileSize); + s = GetFileSize(fname, IOOptions(), &fileSize, nullptr); if (!s.ok()) { return s; } // Will not map empty files if (fileSize == 0) { - return Status::NotSupported( + return IOStatus::NotSupported( "NewMemoryMappedFileBuffer can not map zero length files: " + fname); } // size_t is 32-bit with 32-bit builds if (fileSize > std::numeric_limits::max()) { - return Status::NotSupported( - "The specified file size does not fit into 32-bit memory addressing: " - + fname); + return IOStatus::NotSupported( + "The specified file size does not fit into 32-bit memory addressing: " + + fname); } HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READWRITE, @@ -487,15 +572,16 @@ Status WinEnvIO::NewMemoryMappedFileBuffer( return s; } -Status WinEnvIO::NewDirectory(const std::string& name, - std::unique_ptr* result) { - Status s; +IOStatus WinFileSystem::NewDirectory(const std::string& name, + const IOOptions& /*options*/, + std::unique_ptr* result, + IODebugContext* /*dbg*/) { + IOStatus s; // Must be nullptr on failure result->reset(); if (!DirExists(name)) { - s = IOErrorFromWindowsError( - "open folder: " + name, ERROR_DIRECTORY); + s = IOErrorFromWindowsError("open folder: " + name, ERROR_DIRECTORY); return s; } @@ -505,10 +591,9 @@ Status WinEnvIO::NewDirectory(const std::string& name, IOSTATS_TIMER_GUARD(open_nanos); handle = RX_CreateFile( RX_FN(name).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, - FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible NULL); } @@ -523,8 +608,10 @@ Status WinEnvIO::NewDirectory(const std::string& name, return s; } -Status WinEnvIO::FileExists(const std::string& fname) { - Status s; +IOStatus WinFileSystem::FileExists(const std::string& fname, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus s; // TODO: This does not follow symbolic links at this point // which is consistent with _access() impl on windows // but can be added @@ -533,70 +620,74 @@ Status WinEnvIO::FileExists(const std::string& fname) { GetFileExInfoStandard, &attrs)) { auto lastError = GetLastError(); switch (lastError) { - case ERROR_ACCESS_DENIED: - case ERROR_NOT_FOUND: - case ERROR_FILE_NOT_FOUND: - case ERROR_PATH_NOT_FOUND: - s = Status::NotFound(); - break; - default: - s = IOErrorFromWindowsError("Unexpected error for: " + fname, - lastError); - break; + case ERROR_ACCESS_DENIED: + case ERROR_NOT_FOUND: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + s = IOStatus::NotFound(); + break; + default: + s = IOErrorFromWindowsError("Unexpected error for: " + fname, + lastError); + break; } } return s; } -Status WinEnvIO::GetChildren(const std::string& dir, - std::vector* result) { - - Status status; +IOStatus WinFileSystem::GetChildren(const std::string& dir, + const IOOptions& /*opts*/, + std::vector* result, + IODebugContext* /*dbg*/) { + IOStatus status; result->clear(); - std::vector output; RX_WIN32_FIND_DATA data; memset(&data, 0, sizeof(data)); std::string pattern(dir); pattern.append("\\").append("*"); - HANDLE handle = RX_FindFirstFileEx(RX_FN(pattern).c_str(), - // Do not want alternative name - FindExInfoBasic, - 
&data, - FindExSearchNameMatch, - NULL, // lpSearchFilter - 0); + HANDLE handle = + RX_FindFirstFileEx(RX_FN(pattern).c_str(), + // Do not want alternative name + FindExInfoBasic, &data, FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); if (handle == INVALID_HANDLE_VALUE) { auto lastError = GetLastError(); switch (lastError) { - case ERROR_NOT_FOUND: - case ERROR_ACCESS_DENIED: - case ERROR_FILE_NOT_FOUND: - case ERROR_PATH_NOT_FOUND: - status = Status::NotFound(); - break; - default: - status = IOErrorFromWindowsError( - "Failed to GetChhildren for: " + dir, lastError); + case ERROR_NOT_FOUND: + case ERROR_ACCESS_DENIED: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + status = IOStatus::NotFound(); + break; + default: + status = IOErrorFromWindowsError("Failed to GetChhildren for: " + dir, + lastError); } return status; } UniqueFindClosePtr fc(handle, FindCloseFunc); - if (result->capacity() > 0) { - output.reserve(result->capacity()); - } - // For safety data.cFileName[MAX_PATH - 1] = 0; while (true) { - auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName)); - output.emplace_back(FN_TO_RX(x)); - BOOL ret =- RX_FindNextFile(handle, &data); + // filter out '.' and '..' directory entries + // which appear only on some platforms + const bool ignore = + ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0) && + (RX_FNCMP(data.cFileName, ".") == 0 || + RX_FNCMP(data.cFileName, "..") == 0); + if (!ignore) { + auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName)); + result->push_back(FN_TO_RX(x)); + } + + BOOL ret = -RX_FindNextFile(handle, &data); // If the function fails the return value is zero // and non-zero otherwise. Not TRUE or FALSE. if (ret == FALSE) { @@ -605,24 +696,27 @@ Status WinEnvIO::GetChildren(const std::string& dir, } data.cFileName[MAX_PATH - 1] = 0; } - output.swap(*result); return status; } -Status WinEnvIO::CreateDir(const std::string& name) { - Status result; +IOStatus WinFileSystem::CreateDir(const std::string& name, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL); if (!ret) { auto lastError = GetLastError(); - result = IOErrorFromWindowsError( - "Failed to create a directory: " + name, lastError); + result = IOErrorFromWindowsError("Failed to create a directory: " + name, + lastError); } return result; } -Status WinEnvIO::CreateDirIfMissing(const std::string& name) { - Status result; +IOStatus WinFileSystem::CreateDirIfMissing(const std::string& name, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; if (DirExists(name)) { return result; @@ -632,30 +726,32 @@ Status WinEnvIO::CreateDirIfMissing(const std::string& name) { if (!ret) { auto lastError = GetLastError(); if (lastError != ERROR_ALREADY_EXISTS) { - result = IOErrorFromWindowsError( - "Failed to create a directory: " + name, lastError); + result = IOErrorFromWindowsError("Failed to create a directory: " + name, + lastError); } else { - result = - Status::IOError(name + ": exists but is not a directory"); + result = IOStatus::IOError(name + ": exists but is not a directory"); } } return result; } -Status WinEnvIO::DeleteDir(const std::string& name) { - Status result; +IOStatus WinFileSystem::DeleteDir(const std::string& name, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus result; BOOL ret = RX_RemoveDirectory(RX_FN(name).c_str()); if (!ret) { auto lastError = GetLastError(); - result = IOErrorFromWindowsError("Failed to remove dir: " 
+ name, - lastError); + result = + IOErrorFromWindowsError("Failed to remove dir: " + name, lastError); } return result; } -Status WinEnvIO::GetFileSize(const std::string& fname, - uint64_t* size) { - Status s; +IOStatus WinFileSystem::GetFileSize(const std::string& fname, + const IOOptions& /*opts*/, uint64_t* size, + IODebugContext* /*dbg*/) { + IOStatus s; WIN32_FILE_ATTRIBUTE_DATA attrs; if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, @@ -671,7 +767,7 @@ Status WinEnvIO::GetFileSize(const std::string& fname, return s; } -uint64_t WinEnvIO::FileTimeToUnixTime(const FILETIME& ftTime) { +uint64_t WinFileSystem::FileTimeToUnixTime(const FILETIME& ftTime) { const uint64_t c_FileTimePerSecond = 10000000U; // UNIX epoch starts on 1970-01-01T00:00:00Z // Windows FILETIME starts on 1601-01-01T00:00:00Z @@ -685,31 +781,35 @@ uint64_t WinEnvIO::FileTimeToUnixTime(const FILETIME& ftTime) { li.LowPart = ftTime.dwLowDateTime; uint64_t result = - (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch; + (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch; return result; } -Status WinEnvIO::GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) { - Status s; +IOStatus WinFileSystem::GetFileModificationTime(const std::string& fname, + const IOOptions& /*opts*/, + uint64_t* file_mtime, + IODebugContext* /*dbg*/) { + IOStatus s; WIN32_FILE_ATTRIBUTE_DATA attrs; if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, - &attrs)) { + &attrs)) { *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime); } else { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Can not get file modification time for: " + fname, lastError); + "Can not get file modification time for: " + fname, lastError); *file_mtime = 0; } return s; } -Status WinEnvIO::RenameFile(const std::string& src, - const std::string& target) { - Status result; +IOStatus WinFileSystem::RenameFile(const std::string& src, + const std::string& target, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; // rename() is not capable of replacing the existing file as on Linux // so use OS API directly @@ -726,14 +826,16 @@ Status WinEnvIO::RenameFile(const std::string& src, return result; } -Status WinEnvIO::LinkFile(const std::string& src, - const std::string& target) { - Status result; +IOStatus WinFileSystem::LinkFile(const std::string& src, + const std::string& target, + const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; - if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) { + if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) { DWORD lastError = GetLastError(); if (lastError == ERROR_NOT_SAME_DEVICE) { - return Status::NotSupported("No cross FS links allowed"); + return IOStatus::NotSupported("No cross FS links allowed"); } std::string text("Failed to link: "); @@ -745,12 +847,14 @@ Status WinEnvIO::LinkFile(const std::string& src, return result; } -Status WinEnvIO::NumFileLinks(const std::string& fname, uint64_t* count) { - Status s; - HANDLE handle = RX_CreateFile( - RX_FN(fname).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); +IOStatus WinFileSystem::NumFileLinks(const std::string& fname, + const IOOptions& /*opts*/, uint64_t* count, + IODebugContext* /*dbg*/) { + IOStatus s; + HANDLE handle = + RX_CreateFile(RX_FN(fname).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + 
NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); if (INVALID_HANDLE_VALUE == handle) { auto lastError = GetLastError(); @@ -771,26 +875,27 @@ Status WinEnvIO::NumFileLinks(const std::string& fname, uint64_t* count) { return s; } -Status WinEnvIO::AreFilesSame(const std::string& first, - const std::string& second, bool* res) { +IOStatus WinFileSystem::AreFilesSame(const std::string& first, + const std::string& second, + const IOOptions& /*opts*/, bool* res, + IODebugContext* /*dbg*/) { // For MinGW builds #if (_WIN32_WINNT == _WIN32_WINNT_VISTA) - Status s = Status::NotSupported(); + IOStatus s = IOStatus::NotSupported(); #else assert(res != nullptr); - Status s; + IOStatus s; if (res == nullptr) { - s = Status::InvalidArgument("res"); + s = IOStatus::InvalidArgument("res"); return s; } // 0 - for access means read metadata HANDLE file_1 = RX_CreateFile( RX_FN(first).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, - FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible NULL); if (INVALID_HANDLE_VALUE == file_1) { @@ -802,9 +907,9 @@ Status WinEnvIO::AreFilesSame(const std::string& first, HANDLE file_2 = RX_CreateFile( RX_FN(second).c_str(), 0, - FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, - NULL, OPEN_EXISTING, - FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible NULL); if (INVALID_HANDLE_VALUE == file_2) { @@ -824,9 +929,9 @@ Status WinEnvIO::AreFilesSame(const std::string& first, return s; } - FILE_ID_INFO FileInfo_2; - result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2, - sizeof(FileInfo_2)); + FILE_ID_INFO FileInfo_2; + result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2, + sizeof(FileInfo_2)); if (!result) { auto lastError = GetLastError(); @@ -835,9 +940,9 @@ Status WinEnvIO::AreFilesSame(const std::string& first, } if (FileInfo_1.VolumeSerialNumber == FileInfo_2.VolumeSerialNumber) { - *res = (0 == memcmp(FileInfo_1.FileId.Identifier, - FileInfo_2.FileId.Identifier, - sizeof(FileInfo_1.FileId.Identifier))); + *res = + (0 == memcmp(FileInfo_1.FileId.Identifier, FileInfo_2.FileId.Identifier, + sizeof(FileInfo_1.FileId.Identifier))); } else { *res = false; } @@ -845,12 +950,13 @@ Status WinEnvIO::AreFilesSame(const std::string& first, return s; } -Status WinEnvIO::LockFile(const std::string& lockFname, - FileLock** lock) { +IOStatus WinFileSystem::LockFile(const std::string& lockFname, + const IOOptions& /*opts*/, FileLock** lock, + IODebugContext* /*dbg*/) { assert(lock != nullptr); *lock = NULL; - Status result; + IOStatus result; // No-sharing, this is a LOCK file const DWORD ExclusiveAccessON = 0; @@ -862,15 +968,14 @@ Status WinEnvIO::LockFile(const std::string& lockFname, { IOSTATS_TIMER_GUARD(open_nanos); hFile = RX_CreateFile(RX_FN(lockFname).c_str(), - (GENERIC_READ | GENERIC_WRITE), - ExclusiveAccessON, NULL, CREATE_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); + (GENERIC_READ | GENERIC_WRITE), ExclusiveAccessON, + NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); } if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); - result = IOErrorFromWindowsError( - "Failed to create lock file: " + lockFname, lastError); + result = IOErrorFromWindowsError("Failed to create lock file: " + 
lockFname, + lastError); } else { *lock = new WinFileLock(hFile); } @@ -878,8 +983,9 @@ Status WinEnvIO::LockFile(const std::string& lockFname, return result; } -Status WinEnvIO::UnlockFile(FileLock* lock) { - Status result; +IOStatus WinFileSystem::UnlockFile(FileLock* lock, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) { + IOStatus result; assert(lock != nullptr); @@ -888,8 +994,9 @@ Status WinEnvIO::UnlockFile(FileLock* lock) { return result; } -Status WinEnvIO::GetTestDirectory(std::string* result) { - +IOStatus WinFileSystem::GetTestDirectory(const IOOptions& opts, + std::string* result, + IODebugContext* dbg) { std::string output; const char* env = getenv("TEST_TMPDIR"); @@ -904,21 +1011,23 @@ Status WinEnvIO::GetTestDirectory(std::string* result) { output = "c:\\tmp"; } } - CreateDir(output); + CreateDir(output, opts, dbg); output.append("\\testrocksdb-"); output.append(std::to_string(GetCurrentProcessId())); - CreateDir(output); + CreateDir(output, opts, dbg); output.swap(*result); - return Status::OK(); + return IOStatus::OK(); } -Status WinEnvIO::NewLogger(const std::string& fname, - std::shared_ptr* result) { - Status s; +IOStatus WinFileSystem::NewLogger(const std::string& fname, + const IOOptions& /*opts*/, + std::shared_ptr* result, + IODebugContext* /*dbg*/) { + IOStatus s; result->reset(); @@ -951,72 +1060,25 @@ Status WinEnvIO::NewLogger(const std::string& fname, // Set creation, last access and last write time to the same value SetFileTime(hFile, &ft, &ft, &ft); } - result->reset(new WinLogger(&WinEnvThreads::gettid, hosted_env_, hFile)); + result->reset(new WinLogger(&WinEnvThreads::gettid, clock_.get(), hFile)); } return s; } -Status WinEnvIO::IsDirectory(const std::string& path, bool* is_dir) { +IOStatus WinFileSystem::IsDirectory(const std::string& path, + const IOOptions& /*opts*/, bool* is_dir, + IODebugContext* /*dbg*/) { BOOL ret = RX_PathIsDirectory(RX_FN(path).c_str()); if (is_dir) { *is_dir = ret ? true : false; } - return Status::OK(); -} - -uint64_t WinEnvIO::NowMicros() { - - if (GetSystemTimePreciseAsFileTime_ != NULL) { - // all std::chrono clocks on windows proved to return - // values that may repeat that is not good enough for some uses. - const int64_t c_UnixEpochStartTicks = 116444736000000000LL; - const int64_t c_FtToMicroSec = 10; - - // This interface needs to return system time and not - // just any microseconds because it is often used as an argument - // to TimedWait() on condition variable - FILETIME ftSystemTime; - GetSystemTimePreciseAsFileTime_(&ftSystemTime); - - LARGE_INTEGER li; - li.LowPart = ftSystemTime.dwLowDateTime; - li.HighPart = ftSystemTime.dwHighDateTime; - // Subtract unix epoch start - li.QuadPart -= c_UnixEpochStartTicks; - // Convert to microsecs - li.QuadPart /= c_FtToMicroSec; - return li.QuadPart; - } - using namespace std::chrono; - return duration_cast(system_clock::now().time_since_epoch()) - .count(); -} - -uint64_t WinEnvIO::NowNanos() { - if (nano_seconds_per_period_ != 0) { - // all std::chrono clocks on windows have the same resolution that is only - // good enough for microseconds but not nanoseconds - // On Windows 8 and Windows 2012 Server - // GetSystemTimePreciseAsFileTime(¤t_time) can be used - LARGE_INTEGER li; - QueryPerformanceCounter(&li); - // Convert performance counter to nanoseconds by precomputed ratio. - // Directly multiply nano::den with li.QuadPart causes overflow. - // Only do this when nano::den is divisible by perf_counter_frequency_, - // which most likely is the case in reality. 
If it's not, fall back to - // high_resolution_clock, which may be less precise under old compilers. - li.QuadPart *= nano_seconds_per_period_; - return li.QuadPart; - } - using namespace std::chrono; - return duration_cast( - high_resolution_clock::now().time_since_epoch()).count(); + return IOStatus::OK(); } Status WinEnvIO::GetHostName(char* name, uint64_t len) { Status s; DWORD nSize = static_cast( - std::min(len, std::numeric_limits::max())); + std::min(len, std::numeric_limits::max())); if (!::GetComputerNameA(name, &nSize)) { auto lastError = GetLastError(); @@ -1028,15 +1090,17 @@ Status WinEnvIO::GetHostName(char* name, uint64_t len) { return s; } -Status WinEnvIO::GetAbsolutePath(const std::string& db_path, - std::string* output_path) { +IOStatus WinFileSystem::GetAbsolutePath(const std::string& db_path, + const IOOptions& /*options*/, + std::string* output_path, + IODebugContext* dbg) { // Check if we already have an absolute path // For test compatibility we will consider starting slash as an // absolute path if ((!db_path.empty() && (db_path[0] == '\\' || db_path[0] == '/')) || - !RX_PathIsRelative(RX_FN(db_path).c_str())) { + !RX_PathIsRelative(RX_FN(db_path).c_str())) { *output_path = db_path; - return Status::OK(); + return IOStatus::OK(); } RX_FILESTRING result; @@ -1055,42 +1119,19 @@ Status WinEnvIO::GetAbsolutePath(const std::string& db_path, std::string res = FN_TO_RX(result); res.swap(*output_path); - return Status::OK(); -} - -std::string WinEnvIO::TimeToString(uint64_t secondsSince1970) { - std::string result; - - const time_t seconds = secondsSince1970; - const int maxsize = 64; - - struct tm t; - errno_t ret = localtime_s(&t, &seconds); - - if (ret) { - result = std::to_string(seconds); - } else { - result.resize(maxsize); - char* p = &result[0]; - - int len = snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", - t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, - t.tm_min, t.tm_sec); - assert(len > 0); - - result.resize(len); - } - - return result; + return IOStatus::OK(); } -Status WinEnvIO::GetFreeSpace(const std::string& path, uint64_t* diskfree) { +IOStatus WinFileSystem::GetFreeSpace(const std::string& path, + const IOOptions& /*options*/, + uint64_t* diskfree, + IODebugContext* /*dbg*/) { assert(diskfree != nullptr); ULARGE_INTEGER freeBytes; BOOL f = RX_GetDiskFreeSpaceEx(RX_FN(path).c_str(), &freeBytes, NULL, NULL); if (f) { *diskfree = freeBytes.QuadPart; - return Status::OK(); + return IOStatus::OK(); } else { DWORD lastError = GetLastError(); return IOErrorFromWindowsError("Failed to get free space: " + path, @@ -1098,9 +1139,9 @@ Status WinEnvIO::GetFreeSpace(const std::string& path, uint64_t* diskfree) { } } -EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const { - EnvOptions optimized(env_options); +FileOptions WinFileSystem::OptimizeForLogWrite( + const FileOptions& file_options, const DBOptions& db_options) const { + FileOptions optimized(file_options); // These two the same as default optimizations optimized.bytes_per_sync = db_options.wal_bytes_per_sync; optimized.writable_file_max_buffer_size = @@ -1114,33 +1155,33 @@ EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options, return optimized; } -EnvOptions WinEnvIO::OptimizeForManifestWrite( - const EnvOptions& env_options) const { - EnvOptions optimized(env_options); +FileOptions WinFileSystem::OptimizeForManifestWrite( + const FileOptions& options) const { + FileOptions optimized(options); optimized.use_mmap_writes = 
false; optimized.use_direct_reads = false; return optimized; } -EnvOptions WinEnvIO::OptimizeForManifestRead( - const EnvOptions& env_options) const { - EnvOptions optimized(env_options); +FileOptions WinFileSystem::OptimizeForManifestRead( + const FileOptions& file_options) const { + FileOptions optimized(file_options); optimized.use_mmap_writes = false; optimized.use_direct_reads = false; return optimized; } // Returns true iff the named directory exists and is a directory. -bool WinEnvIO::DirExists(const std::string& dname) { +bool WinFileSystem::DirExists(const std::string& dname) { WIN32_FILE_ATTRIBUTE_DATA attrs; - if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), - GetFileExInfoStandard, &attrs)) { + if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), GetFileExInfoStandard, + &attrs)) { return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY); } return false; } -size_t WinEnvIO::GetSectorSize(const std::string& fname) { +size_t WinFileSystem::GetSectorSize(const std::string& fname) { size_t sector_size = kSectorSize; if (RX_PathIsRelative(RX_FN(fname).c_str())) { @@ -1170,21 +1211,21 @@ size_t WinEnvIO::GetSectorSize(const std::string& fname) { BYTE output_buffer[sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)]; DWORD output_bytes = 0; - BOOL ret = DeviceIoControl(hDevice, IOCTL_STORAGE_QUERY_PROPERTY, - &spropertyquery, sizeof(spropertyquery), - output_buffer, - sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), - &output_bytes, nullptr); + BOOL ret = DeviceIoControl( + hDevice, IOCTL_STORAGE_QUERY_PROPERTY, &spropertyquery, + sizeof(spropertyquery), output_buffer, + sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), &output_bytes, nullptr); if (ret) { - sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR *)output_buffer)->BytesPerLogicalSector; + sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR*)output_buffer) + ->BytesPerLogicalSector; } else { - // many devices do not support StorageProcessAlignmentProperty. Any failure here and we - // fall back to logical alignment + // many devices do not support StorageProcessAlignmentProperty. Any failure + // here and we fall back to logical alignment - DISK_GEOMETRY_EX geometry = { 0 }; - ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, - nullptr, 0, &geometry, sizeof(geometry), &output_bytes, nullptr); + DISK_GEOMETRY_EX geometry = {0}; + ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, nullptr, 0, + &geometry, sizeof(geometry), &output_bytes, nullptr); if (ret) { sector_size = geometry.Geometry.BytesPerSector; } @@ -1202,17 +1243,15 @@ size_t WinEnvIO::GetSectorSize(const std::string& fname) { WinEnvThreads::WinEnvThreads(Env* hosted_env) : hosted_env_(hosted_env), thread_pools_(Env::Priority::TOTAL) { - for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { thread_pools_[pool_id].SetThreadPriority( - static_cast(pool_id)); + static_cast(pool_id)); // This allows later initializing the thread-local-env of each thread. 
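// [Sketch, not part of the patch] This constructor loop creates one pool per
// Env::Priority value and tags it with that priority; Schedule() below then
// routes a job to the pool matching the requested priority. Assuming a
// WinEnvThreads instance `threads` and a caller-defined callback `MyJob`:
//
//   void MyJob(void* arg);  // hypothetical worker function
//   threads.Schedule(&MyJob, arg, Env::Priority::LOW, /*tag=*/nullptr,
//                    /*unschedFunction=*/nullptr);
//   // dispatches to thread_pools_[Env::Priority::LOW]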
thread_pools_[pool_id].SetHostEnv(hosted_env); } } WinEnvThreads::~WinEnvThreads() { - WaitForJoin(); for (auto& thpool : thread_pools_) { @@ -1220,9 +1259,9 @@ WinEnvThreads::~WinEnvThreads() { } } -void WinEnvThreads::Schedule(void(*function)(void*), void* arg, +void WinEnvThreads::Schedule(void (*function)(void*), void* arg, Env::Priority pri, void* tag, - void(*unschedFunction)(void* arg)) { + void (*unschedFunction)(void* arg)) { assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); } @@ -1233,21 +1272,21 @@ int WinEnvThreads::UnSchedule(void* arg, Env::Priority pri) { namespace { - struct StartThreadState { - void(*user_function)(void*); - void* arg; - }; +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; - void* StartThreadWrapper(void* arg) { - std::unique_ptr state( +void* StartThreadWrapper(void* arg) { + std::unique_ptr state( reinterpret_cast(arg)); - state->user_function(state->arg); - return nullptr; - } - + state->user_function(state->arg); + return nullptr; } -void WinEnvThreads::StartThread(void(*function)(void* arg), void* arg) { +} // namespace + +void WinEnvThreads::StartThread(void (*function)(void* arg), void* arg) { std::unique_ptr state(new StartThreadState); state->user_function = function; state->arg = arg; @@ -1282,10 +1321,6 @@ uint64_t WinEnvThreads::gettid() { uint64_t WinEnvThreads::GetThreadID() const { return gettid(); } -void WinEnvThreads::SleepForMicroseconds(int micros) { - std::this_thread::sleep_for(std::chrono::microseconds(micros)); -} - void WinEnvThreads::SetBackgroundThreads(int num, Env::Priority pri) { assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); thread_pools_[pri].SetBackgroundThreads(num); @@ -1304,12 +1339,14 @@ void WinEnvThreads::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { ///////////////////////////////////////////////////////////////////////// // WinEnv -WinEnv::WinEnv() : winenv_io_(this), winenv_threads_(this) { +WinEnv::WinEnv() + : CompositeEnv(WinFileSystem::Default(), WinClock::Default()), + winenv_io_(this), + winenv_threads_(this) { // Protected member of the base class thread_status_updater_ = CreateThreadStatusUpdater(); } - WinEnv::~WinEnv() { // All threads must be joined before the deletion of // thread_status_updater_. 
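// [Sketch, not part of the patch] With WinEnv now built on
// CompositeEnv(WinFileSystem::Default(), WinClock::Default()), file I/O flows
// through the FileSystem API rather than Env virtuals. A minimal caller-side
// illustration, assuming "C:\\db" is an existing database directory:
//
//   std::shared_ptr<FileSystem> fs = FileSystem::Default();  // WinFileSystem
//   uint64_t size = 0;
//   IOStatus io_s =
//       fs->GetFileSize("C:\\db\\CURRENT", IOOptions(), &size, /*dbg=*/nullptr);
//   std::unique_ptr<Env> env = NewCompositeEnv(fs);  // Env facade over `fs`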
@@ -1321,155 +1358,12 @@ Status WinEnv::GetThreadList(std::vector* thread_list) { return thread_status_updater_->GetThreadList(thread_list); } -Status WinEnv::DeleteFile(const std::string& fname) { - return winenv_io_.DeleteFile(fname); -} - -Status WinEnv::Truncate(const std::string& fname, size_t size) { - return winenv_io_.Truncate(fname, size); -} - -Status WinEnv::GetCurrentTime(int64_t* unix_time) { - return winenv_io_.GetCurrentTime(unix_time); -} - -Status WinEnv::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.NewSequentialFile(fname, result, options); -} - -Status WinEnv::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.NewRandomAccessFile(fname, result, options); -} - -Status WinEnv::NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.OpenWritableFile(fname, result, options, false); -} - -Status WinEnv::ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - return winenv_io_.OpenWritableFile(fname, result, options, true); -} - -Status WinEnv::NewRandomRWFile(const std::string & fname, - std::unique_ptr* result, - const EnvOptions & options) { - return winenv_io_.NewRandomRWFile(fname, result, options); -} - -Status WinEnv::NewMemoryMappedFileBuffer( - const std::string& fname, - std::unique_ptr* result) { - return winenv_io_.NewMemoryMappedFileBuffer(fname, result); -} - -Status WinEnv::NewDirectory(const std::string& name, - std::unique_ptr* result) { - return winenv_io_.NewDirectory(name, result); -} - -Status WinEnv::FileExists(const std::string& fname) { - return winenv_io_.FileExists(fname); -} - -Status WinEnv::GetChildren(const std::string& dir, - std::vector* result) { - return winenv_io_.GetChildren(dir, result); -} - -Status WinEnv::CreateDir(const std::string& name) { - return winenv_io_.CreateDir(name); -} - -Status WinEnv::CreateDirIfMissing(const std::string& name) { - return winenv_io_.CreateDirIfMissing(name); -} - -Status WinEnv::DeleteDir(const std::string& name) { - return winenv_io_.DeleteDir(name); -} - -Status WinEnv::GetFileSize(const std::string& fname, - uint64_t* size) { - return winenv_io_.GetFileSize(fname, size); -} - -Status WinEnv::GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) { - return winenv_io_.GetFileModificationTime(fname, file_mtime); -} - -Status WinEnv::RenameFile(const std::string& src, - const std::string& target) { - return winenv_io_.RenameFile(src, target); -} - -Status WinEnv::LinkFile(const std::string& src, - const std::string& target) { - return winenv_io_.LinkFile(src, target); -} - -Status WinEnv::NumFileLinks(const std::string& fname, uint64_t* count) { - return winenv_io_.NumFileLinks(fname, count); -} - -Status WinEnv::AreFilesSame(const std::string& first, - const std::string& second, bool* res) { - return winenv_io_.AreFilesSame(first, second, res); -} - -Status WinEnv::LockFile(const std::string& lockFname, - FileLock** lock) { - return winenv_io_.LockFile(lockFname, lock); -} - -Status WinEnv::UnlockFile(FileLock* lock) { - return winenv_io_.UnlockFile(lock); -} - -Status WinEnv::GetTestDirectory(std::string* result) { - return winenv_io_.GetTestDirectory(result); -} - -Status WinEnv::NewLogger(const std::string& fname, - std::shared_ptr* result) { - return winenv_io_.NewLogger(fname, result); -} - -Status WinEnv::IsDirectory(const 
std::string& path, bool* is_dir) { - return winenv_io_.IsDirectory(path, is_dir); -} - -uint64_t WinEnv::NowMicros() { - return winenv_io_.NowMicros(); -} - -uint64_t WinEnv::NowNanos() { - return winenv_io_.NowNanos(); -} - Status WinEnv::GetHostName(char* name, uint64_t len) { return winenv_io_.GetHostName(name, len); } -Status WinEnv::GetAbsolutePath(const std::string& db_path, - std::string* output_path) { - return winenv_io_.GetAbsolutePath(db_path, output_path); -} - -std::string WinEnv::TimeToString(uint64_t secondsSince1970) { - return winenv_io_.TimeToString(secondsSince1970); -} - -void WinEnv::Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, - void(*unschedFunction)(void* arg)) { +void WinEnv::Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)) { return winenv_threads_.Schedule(function, arg, pri, tag, unschedFunction); } @@ -1477,32 +1371,20 @@ int WinEnv::UnSchedule(void* arg, Env::Priority pri) { return winenv_threads_.UnSchedule(arg, pri); } -void WinEnv::StartThread(void(*function)(void* arg), void* arg) { +void WinEnv::StartThread(void (*function)(void* arg), void* arg) { return winenv_threads_.StartThread(function, arg); } -void WinEnv::WaitForJoin() { - return winenv_threads_.WaitForJoin(); -} +void WinEnv::WaitForJoin() { return winenv_threads_.WaitForJoin(); } -unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const { +unsigned int WinEnv::GetThreadPoolQueueLen(Env::Priority pri) const { return winenv_threads_.GetThreadPoolQueueLen(pri); } -uint64_t WinEnv::GetThreadID() const { - return winenv_threads_.GetThreadID(); -} - -Status WinEnv::GetFreeSpace(const std::string& path, uint64_t* diskfree) { - return winenv_io_.GetFreeSpace(path, diskfree); -} - -void WinEnv::SleepForMicroseconds(int micros) { - return winenv_threads_.SleepForMicroseconds(micros); -} +uint64_t WinEnv::GetThreadID() const { return winenv_threads_.GetThreadID(); } // Allow increasing the number of worker threads. 
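// [Sketch, not part of the patch] The pool-sizing entry points below keep
// their Env-level signatures. Typical usage sizes the low- and high-priority
// pools, which RocksDB commonly uses for compactions and flushes respectively:
//
//   Env* env = Env::Default();  // WinEnv on Windows builds
//   env->SetBackgroundThreads(4, Env::Priority::LOW);
//   env->SetBackgroundThreads(2, Env::Priority::HIGH);
//   env->IncBackgroundThreadsIfNeeded(6, Env::Priority::LOW);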
-void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) { +void WinEnv::SetBackgroundThreads(int num, Env::Priority pri) { return winenv_threads_.SetBackgroundThreads(num, pri); } @@ -1510,25 +1392,10 @@ int WinEnv::GetBackgroundThreads(Env::Priority pri) { return winenv_threads_.GetBackgroundThreads(pri); } -void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { +void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { return winenv_threads_.IncBackgroundThreadsIfNeeded(num, pri); } -EnvOptions WinEnv::OptimizeForManifestRead( - const EnvOptions& env_options) const { - return winenv_io_.OptimizeForManifestRead(env_options); -} - -EnvOptions WinEnv::OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const { - return winenv_io_.OptimizeForLogWrite(env_options, db_options); -} - -EnvOptions WinEnv::OptimizeForManifestWrite( - const EnvOptions& env_options) const { - return winenv_io_.OptimizeForManifestWrite(env_options); -} - } // namespace port std::string Env::GenerateUniqueId() { @@ -1550,6 +1417,19 @@ std::string Env::GenerateUniqueId() { return result; } +std::shared_ptr FileSystem::Default() { + return port::WinFileSystem::Default(); +} + +const std::shared_ptr& SystemClock::Default() { + static std::shared_ptr clock = + std::make_shared(); + return clock; +} + +std::unique_ptr NewCompositeEnv(const std::shared_ptr& fs) { + return std::unique_ptr(new CompositeEnvWrapper(Env::Default(), fs)); +} } // namespace ROCKSDB_NAMESPACE #endif diff --git a/port/win/env_win.h b/port/win/env_win.h index 24e3a56fe36..54d3e7dbf0c 100644 --- a/port/win/env_win.h +++ b/port/win/env_win.h @@ -15,30 +15,30 @@ // multiple threads without any external synchronization. #pragma once - -#include "port/win/win_thread.h" -#include -#include "util/threadpool_imp.h" - #include #include #include -#include #include +#include +#include "env/composite_env_wrapper.h" +#include "port/win/win_thread.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" +#include "rocksdb/system_clock.h" +#include "util/threadpool_imp.h" #undef GetCurrentTime #undef DeleteFile -#undef GetTickCount +#undef LoadLibrary namespace ROCKSDB_NAMESPACE { namespace port { // Currently not designed for inheritance but rather a replacement class WinEnvThreads { -public: - + public: explicit WinEnvThreads(Env* hosted_env); ~WinEnvThreads(); @@ -46,12 +46,12 @@ class WinEnvThreads { WinEnvThreads(const WinEnvThreads&) = delete; WinEnvThreads& operator=(const WinEnvThreads&) = delete; - void Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, void(*unschedFunction)(void* arg)); + void Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)); int UnSchedule(void* arg, Env::Priority pri); - void StartThread(void(*function)(void* arg), void* arg); + void StartThread(void (*function)(void* arg), void* arg); void WaitForJoin(); @@ -61,255 +61,211 @@ class WinEnvThreads { uint64_t GetThreadID() const; - void SleepForMicroseconds(int micros); - // Allow increasing the number of worker threads. 
void SetBackgroundThreads(int num, Env::Priority pri); int GetBackgroundThreads(Env::Priority pri); void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri); -private: - + private: Env* hosted_env_; mutable std::mutex mu_; std::vector thread_pools_; std::vector threads_to_join_; - }; -// Designed for inheritance so can be re-used -// but certain parts replaced -class WinEnvIO { -public: - explicit WinEnvIO(Env* hosted_env); - - virtual ~WinEnvIO(); - - virtual Status DeleteFile(const std::string& fname); - - Status Truncate(const std::string& fname, size_t size); - - virtual Status GetCurrentTime(int64_t* unix_time); - - virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); - - // Helper for NewWritable and ReopenWritableFile - virtual Status OpenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options, - bool reopen); - - virtual Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); - - // The returned file will only be accessed by one thread at a time. - virtual Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); - - virtual Status NewMemoryMappedFileBuffer( - const std::string& fname, - std::unique_ptr* result); - - virtual Status NewDirectory(const std::string& name, - std::unique_ptr* result); - - virtual Status FileExists(const std::string& fname); - - virtual Status GetChildren(const std::string& dir, - std::vector* result); - - virtual Status CreateDir(const std::string& name); +class WinClock : public SystemClock { + public: + WinClock(); + virtual ~WinClock() {} - virtual Status CreateDirIfMissing(const std::string& name); + const char* Name() const override { return "WindowsClock"; } - virtual Status DeleteDir(const std::string& name); - - virtual Status GetFileSize(const std::string& fname, uint64_t* size); - - static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); - - virtual Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime); - - virtual Status RenameFile(const std::string& src, const std::string& target); - - virtual Status LinkFile(const std::string& src, const std::string& target); - - virtual Status NumFileLinks(const std::string& /*fname*/, - uint64_t* /*count*/); - - virtual Status AreFilesSame(const std::string& first, - const std::string& second, bool* res); - - virtual Status LockFile(const std::string& lockFname, FileLock** lock); - - virtual Status UnlockFile(FileLock* lock); - - virtual Status GetTestDirectory(std::string* result); - - virtual Status NewLogger(const std::string& fname, - std::shared_ptr* result); - - virtual Status IsDirectory(const std::string& path, bool* is_dir); - - virtual uint64_t NowMicros(); - - virtual uint64_t NowNanos(); - - virtual Status GetHostName(char* name, uint64_t len); - - virtual Status GetAbsolutePath(const std::string& db_path, - std::string* output_path); - - // This seems to clash with a macro on Windows, so #undef it here -#undef GetFreeSpace - - // Get the amount of free disk space - virtual Status GetFreeSpace(const std::string& path, uint64_t* diskfree); - - virtual std::string TimeToString(uint64_t secondsSince1970); - - virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const; - - virtual EnvOptions OptimizeForManifestWrite( - const EnvOptions& env_options) const; + uint64_t NowMicros() override; - virtual EnvOptions 
OptimizeForManifestRead( - const EnvOptions& env_options) const; + uint64_t NowNanos() override; - size_t GetPageSize() const { return page_size_; } + // 0 indicates not supported + uint64_t CPUMicros() override { return 0; } + void SleepForMicroseconds(int micros) override; - size_t GetAllocationGranularity() const { return allocation_granularity_; } + Status GetCurrentTime(int64_t* unix_time) override; + // Converts seconds-since-Jan-01-1970 to a printable string + virtual std::string TimeToString(uint64_t time); uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; } - static size_t GetSectorSize(const std::string& fname); - -private: - // Returns true iff the named directory exists and is a directory. - virtual bool DirExists(const std::string& dname); + private: + typedef VOID(WINAPI* FnGetSystemTimePreciseAsFileTime)(LPFILETIME); - typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME); - - Env* hosted_env_; - size_t page_size_; - size_t allocation_granularity_; uint64_t perf_counter_frequency_; uint64_t nano_seconds_per_period_; FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_; }; -class WinEnv : public Env { -public: - WinEnv(); - - ~WinEnv(); - - Status DeleteFile(const std::string& fname) override; - - Status Truncate(const std::string& fname, size_t size) override; - - Status GetCurrentTime(int64_t* unix_time) override; +class WinFileSystem : public FileSystem { + public: + static const std::shared_ptr& Default(); + WinFileSystem(const std::shared_ptr& clock); + ~WinFileSystem() {} + const char* Name() const { return "WinFS"; } + static size_t GetSectorSize(const std::string& fname); + size_t GetPageSize() const { return page_size_; } + size_t GetAllocationGranularity() const { return allocation_granularity_; } - Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. On success, stores a pointer to the new file in - // *result and returns OK. On failure stores nullptr in *result and - // returns non-OK. - // - // The returned file will only be accessed by one thread at a time. - Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // The returned file will only be accessed by one thread at a time. - Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - Status NewMemoryMappedFileBuffer( + IOStatus DeleteFile(const std::string& fname, const IOOptions& options, + IODebugContext* dbg) override; + + // Truncate the named file to the specified size. 
+ IOStatus Truncate(const std::string& /*fname*/, size_t /*size*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus NewSequentialFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomAccessFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* /*dbg*/) override; + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override; + IOStatus ReopenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + IODebugContext* dbg) override; + + IOStatus NewRandomRWFile(const std::string& fname, + const FileOptions& file_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus NewMemoryMappedFileBuffer( const std::string& fname, std::unique_ptr* result) override; - Status NewDirectory(const std::string& name, - std::unique_ptr* result) override; - - Status FileExists(const std::string& fname) override; - - Status GetChildren(const std::string& dir, - std::vector* result) override; - - Status CreateDir(const std::string& name) override; - - Status CreateDirIfMissing(const std::string& name) override; - - Status DeleteDir(const std::string& name) override; - - Status GetFileSize(const std::string& fname, - uint64_t* size) override; - - Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) override; - - Status RenameFile(const std::string& src, - const std::string& target) override; - - Status LinkFile(const std::string& src, - const std::string& target) override; - - Status NumFileLinks(const std::string& fname, uint64_t* count) override; + IOStatus NewDirectory(const std::string& name, const IOOptions& io_opts, + std::unique_ptr* result, + IODebugContext* dbg) override; + IOStatus FileExists(const std::string& f, const IOOptions& io_opts, + IODebugContext* dbg) override; + IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts, + std::vector* r, + IODebugContext* dbg) override; + IOStatus CreateDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + + // Creates directory if missing. Return Ok if it exists, or successful in + // Creating. + IOStatus CreateDirIfMissing(const std::string& dirname, + const IOOptions& options, + IODebugContext* dbg) override; + + // Delete the specified directory. + IOStatus DeleteDir(const std::string& dirname, const IOOptions& options, + IODebugContext* dbg) override; + // Store the size of fname in *file_size. + IOStatus GetFileSize(const std::string& fname, const IOOptions& options, + uint64_t* file_size, IODebugContext* dbg) override; + // Store the last modification time of fname in *file_mtime. + IOStatus GetFileModificationTime(const std::string& fname, + const IOOptions& options, + uint64_t* file_mtime, + IODebugContext* dbg) override; + // Rename file src to target. + IOStatus RenameFile(const std::string& src, const std::string& target, + const IOOptions& options, IODebugContext* dbg) override; + + // Hard Link file src to target. 
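// [Sketch, not part of the patch] Per the env_win.cc implementation earlier in
// this patch, a hard link across volumes fails with NotSupported ("No cross FS
// links allowed"). Assuming a FileSystem handle `fs` and that both paths exist:
//
//   IOStatus s = fs->LinkFile("C:\\db\\000007.sst", "D:\\backup\\000007.sst",
//                             IOOptions(), /*dbg=*/nullptr);
//   // s.IsNotSupported() is true here because C: and D: are different volumes.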
+ IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override; + IOStatus NumFileLinks(const std::string& /*fname*/, + const IOOptions& /*options*/, uint64_t* /*count*/, + IODebugContext* /*dbg*/) override; + IOStatus AreFilesSame(const std::string& /*first*/, + const std::string& /*second*/, + const IOOptions& /*options*/, bool* /*res*/, + IODebugContext* /*dbg*/) override; + IOStatus LockFile(const std::string& fname, const IOOptions& options, + FileLock** lock, IODebugContext* dbg) override; + IOStatus UnlockFile(FileLock* lock, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus GetTestDirectory(const IOOptions& options, std::string* path, + IODebugContext* dbg) override; + + // Create and returns a default logger (an instance of EnvLogger) for storing + // informational messages. Derived classes can overide to provide custom + // logger. + IOStatus NewLogger(const std::string& fname, const IOOptions& io_opts, + std::shared_ptr* result, + IODebugContext* dbg) override; + // Get full directory name for this db. + IOStatus GetAbsolutePath(const std::string& db_path, const IOOptions& options, + std::string* output_path, + IODebugContext* dbg) override; + IOStatus IsDirectory(const std::string& /*path*/, const IOOptions& options, + bool* is_dir, IODebugContext* /*dgb*/) override; + // This seems to clash with a macro on Windows, so #undef it here +#undef GetFreeSpace + IOStatus GetFreeSpace(const std::string& /*path*/, + const IOOptions& /*options*/, uint64_t* /*diskfree*/, + IODebugContext* /*dbg*/) override; + FileOptions OptimizeForLogWrite(const FileOptions& file_options, + const DBOptions& db_options) const override; + FileOptions OptimizeForManifestRead( + const FileOptions& file_options) const override; + FileOptions OptimizeForManifestWrite( + const FileOptions& file_options) const override; + + protected: + static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); + // Returns true iff the named directory exists and is a directory. 
- Status AreFilesSame(const std::string& first, - const std::string& second, bool* res) override; + virtual bool DirExists(const std::string& dname); + // Helper for NewWritable and ReopenWritableFile + virtual IOStatus OpenWritableFile(const std::string& fname, + const FileOptions& options, + std::unique_ptr* result, + bool reopen); - Status LockFile(const std::string& lockFname, FileLock** lock) override; + private: + std::shared_ptr clock_; + size_t page_size_; + size_t allocation_granularity_; +}; - Status UnlockFile(FileLock* lock) override; +// Designed for inheritance so can be re-used +// but certain parts replaced +class WinEnvIO { + public: + explicit WinEnvIO(Env* hosted_env); - Status GetTestDirectory(std::string* result) override; + virtual ~WinEnvIO(); - Status NewLogger(const std::string& fname, - std::shared_ptr* result) override; + virtual Status GetHostName(char* name, uint64_t len); - Status IsDirectory(const std::string& path, bool* is_dir) override; + private: + Env* hosted_env_; +}; - uint64_t NowMicros() override; +class WinEnv : public CompositeEnv { + public: + WinEnv(); - uint64_t NowNanos() override; + ~WinEnv(); Status GetHostName(char* name, uint64_t len) override; - Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) override; - - std::string TimeToString(uint64_t secondsSince1970) override; - Status GetThreadList(std::vector* thread_list) override; - void Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, void(*unschedFunction)(void* arg)) override; + void Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)) override; int UnSchedule(void* arg, Env::Priority pri) override; - void StartThread(void(*function)(void* arg), void* arg) override; + void StartThread(void (*function)(void* arg), void* arg) override; void WaitForJoin() override; @@ -317,35 +273,16 @@ class WinEnv : public Env { uint64_t GetThreadID() const override; - // This seems to clash with a macro on Windows, so #undef it here -#undef GetFreeSpace - - // Get the amount of free disk space - Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override; - - void SleepForMicroseconds(int micros) override; - // Allow increasing the number of worker threads. 
void SetBackgroundThreads(int num, Env::Priority pri) override; int GetBackgroundThreads(Env::Priority pri) override; void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) override; - EnvOptions OptimizeForManifestRead( - const EnvOptions& env_options) const override; - - EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const override; - - EnvOptions OptimizeForManifestWrite( - const EnvOptions& env_options) const override; - - -private: - + private: WinEnvIO winenv_io_; WinEnvThreads winenv_threads_; }; -} // namespace port +} // namespace port } // namespace ROCKSDB_NAMESPACE diff --git a/port/win/io_win.cc b/port/win/io_win.cc index f8d1c3dbb83..1e662c06d91 100644 --- a/port/win/io_win.cc +++ b/port/win/io_win.cc @@ -20,36 +20,32 @@ namespace ROCKSDB_NAMESPACE { namespace port { /* -* DirectIOHelper -*/ + * DirectIOHelper + */ namespace { const size_t kSectorSize = 512; -inline -bool IsPowerOfTwo(const size_t alignment) { +inline bool IsPowerOfTwo(const size_t alignment) { return ((alignment) & (alignment - 1)) == 0; } -inline -bool IsSectorAligned(const size_t off) { +inline bool IsSectorAligned(const size_t off) { return (off & (kSectorSize - 1)) == 0; } -inline -bool IsAligned(size_t alignment, const void* ptr) { +inline bool IsAligned(size_t alignment, const void* ptr) { return ((uintptr_t(ptr)) & (alignment - 1)) == 0; } -} - +} // namespace std::string GetWindowsErrSz(DWORD err) { LPSTR lpMsgBuf; FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, err, - 0, // Default language - reinterpret_cast(&lpMsgBuf), 0, NULL); + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, + 0, // Default language + reinterpret_cast(&lpMsgBuf), 0, NULL); std::string Err = lpMsgBuf; LocalFree(lpMsgBuf); @@ -69,21 +65,20 @@ std::string GetWindowsErrSz(DWORD err) { // Because all the reads/writes happen by the specified offset, the caller in // theory should not // rely on the current file offset. 
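// [Sketch, not part of the patch] The helpers below do positioned I/O through
// OVERLAPPED, so the OS file pointer is neither consulted nor advanced. A
// rough call pattern, assuming an open WinFileData* `fd`:
//
//   size_t written = 0;
//   IOStatus ws = pwrite(fd, Slice("hello", 5), /*offset=*/4096, written);
//   char buf[5];
//   size_t read_bytes = 0;
//   IOStatus rs = pread(fd, buf, sizeof(buf), /*offset=*/4096, read_bytes);
//   // Both calls address byte offset 4096 regardless of earlier reads/writes.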
-Status pwrite(const WinFileData* file_data, const Slice& data, - uint64_t offset, size_t& bytes_written) { - - Status s; +IOStatus pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written) { + IOStatus s; bytes_written = 0; size_t num_bytes = data.size(); if (num_bytes > std::numeric_limits::max()) { // May happen in 64-bit builds where size_t is 64-bits but // long is still 32-bit, but that's the API here at the moment - return Status::InvalidArgument("num_bytes is too large for a single write: " + - file_data->GetName()); + return IOStatus::InvalidArgument( + "num_bytes is too large for a single write: " + file_data->GetName()); } - OVERLAPPED overlapped = { 0 }; + OVERLAPPED overlapped = {0}; ULARGE_INTEGER offsetUnion; offsetUnion.QuadPart = offset; @@ -92,11 +87,12 @@ Status pwrite(const WinFileData* file_data, const Slice& data, DWORD bytesWritten = 0; - if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), static_cast(num_bytes), - &bytesWritten, &overlapped)) { + if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), + static_cast(num_bytes), &bytesWritten, + &overlapped)) { auto lastError = GetLastError(); s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(), - lastError); + lastError); } else { bytes_written = bytesWritten; } @@ -105,18 +101,17 @@ Status pwrite(const WinFileData* file_data, const Slice& data, } // See comments for pwrite above -Status pread(const WinFileData* file_data, char* src, size_t num_bytes, - uint64_t offset, size_t& bytes_read) { - - Status s; +IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read) { + IOStatus s; bytes_read = 0; if (num_bytes > std::numeric_limits::max()) { - return Status::InvalidArgument("num_bytes is too large for a single read: " + - file_data->GetName()); + return IOStatus::InvalidArgument( + "num_bytes is too large for a single read: " + file_data->GetName()); } - OVERLAPPED overlapped = { 0 }; + OVERLAPPED overlapped = {0}; ULARGE_INTEGER offsetUnion; offsetUnion.QuadPart = offset; @@ -125,13 +120,14 @@ Status pread(const WinFileData* file_data, char* src, size_t num_bytes, DWORD bytesRead = 0; - if (FALSE == ReadFile(file_data->GetFileHandle(), src, static_cast(num_bytes), - &bytesRead, &overlapped)) { + if (FALSE == ReadFile(file_data->GetFileHandle(), src, + static_cast(num_bytes), &bytesRead, + &overlapped)) { auto lastError = GetLastError(); // EOF is OK with zero bytes read if (lastError != ERROR_HANDLE_EOF) { s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(), - lastError); + lastError); } } else { bytes_read = bytesRead; @@ -143,35 +139,34 @@ Status pread(const WinFileData* file_data, char* src, size_t num_bytes, // SetFileInformationByHandle() is capable of fast pre-allocates. // However, this does not change the file end position unless the file is // truncated and the pre-allocated space is not considered filled with zeros. 
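// [Sketch, not part of the patch] The distinction described above, using the
// two helpers defined below on an open handle `hFile` for file `fname`:
//
//   fallocate(fname, hFile, 1024 * 1024);  // reserve 1 MiB of disk space;
//                                          // logical file size is unchanged
//   ftruncate(fname, hFile, 1024 * 1024);  // move end-of-file to 1 MiB so the
//                                          // reserved space becomes usable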
-Status fallocate(const std::string& filename, HANDLE hFile, - uint64_t to_size) { - Status status; +IOStatus fallocate(const std::string& filename, HANDLE hFile, + uint64_t to_size) { + IOStatus status; FILE_ALLOCATION_INFO alloc_info; alloc_info.AllocationSize.QuadPart = to_size; if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info, - sizeof(FILE_ALLOCATION_INFO))) { + sizeof(FILE_ALLOCATION_INFO))) { auto lastError = GetLastError(); status = IOErrorFromWindowsError( - "Failed to pre-allocate space: " + filename, lastError); + "Failed to pre-allocate space: " + filename, lastError); } return status; } -Status ftruncate(const std::string& filename, HANDLE hFile, - uint64_t toSize) { - Status status; +IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) { + IOStatus status; FILE_END_OF_FILE_INFO end_of_file; end_of_file.EndOfFile.QuadPart = toSize; if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file, - sizeof(FILE_END_OF_FILE_INFO))) { + sizeof(FILE_END_OF_FILE_INFO))) { auto lastError = GetLastError(); status = IOErrorFromWindowsError("Failed to Set end of file: " + filename, - lastError); + lastError); } return status; @@ -212,9 +207,11 @@ WinMmapReadableFile::~WinMmapReadableFile() { assert(ret); } -Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - Status s; +IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { + IOStatus s; if (offset > length_) { *result = Slice(); @@ -222,13 +219,12 @@ Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, } else if (offset + n > length_) { n = length_ - static_cast(offset); } - *result = - Slice(reinterpret_cast(mapped_region_)+offset, n); + *result = Slice(reinterpret_cast(mapped_region_) + offset, n); return s; } -Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const { @@ -238,20 +234,19 @@ size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const { /////////////////////////////////////////////////////////////////////////////// /// WinMmapFile - // Can only truncate or reserve to a sector size aligned if // used on files that are opened with Unbuffered I/O -Status WinMmapFile::TruncateFile(uint64_t toSize) { +IOStatus WinMmapFile::TruncateFile(uint64_t toSize) { return ftruncate(filename_, hFile_, toSize); } -Status WinMmapFile::UnmapCurrentRegion() { - Status status; +IOStatus WinMmapFile::UnmapCurrentRegion() { + IOStatus status; if (mapped_begin_ != nullptr) { if (!::UnmapViewOfFile(mapped_begin_)) { status = IOErrorFromWindowsError( - "Failed to unmap file view: " + filename_, GetLastError()); + "Failed to unmap file view: " + filename_, GetLastError()); } // Move on to the next portion of the file @@ -271,16 +266,16 @@ Status WinMmapFile::UnmapCurrentRegion() { return status; } -Status WinMmapFile::MapNewRegion() { - - Status status; +IOStatus WinMmapFile::MapNewRegion(const IOOptions& options, + IODebugContext* dbg) { + IOStatus status; assert(mapped_begin_ == nullptr); size_t minDiskSize = static_cast(file_offset_) + view_size_; if (minDiskSize > reserved_size_) { - status = Allocate(file_offset_, view_size_); + status = Allocate(file_offset_, view_size_, options, 
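// [Sketch, not part of the patch] Roundup() below rounds the 32 KiB request up
// to the allocation granularity, which is commonly 64 KiB on Windows:
//
//   Roundup(32 * 1024, 64 * 1024) == 64 * 1024;  // view_size_ becomes 64 KiB
//   Roundup(32 * 1024, 4 * 1024)  == 32 * 1024;  // already aligned: unchanged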
dbg); if (!status.ok()) { return status; } @@ -288,7 +283,6 @@ Status WinMmapFile::MapNewRegion() { // Need to remap if (hMap_ == NULL || reserved_size_ > mapping_size_) { - if (hMap_ != NULL) { // Unmap the previous one BOOL ret __attribute__((__unused__)); @@ -301,18 +295,18 @@ Status WinMmapFile::MapNewRegion() { mappingSize.QuadPart = reserved_size_; hMap_ = CreateFileMappingA( - hFile_, - NULL, // Security attributes - PAGE_READWRITE, // There is not a write only mode for mapping - mappingSize.HighPart, // Enable mapping the whole file but the actual - // amount mapped is determined by MapViewOfFile - mappingSize.LowPart, - NULL); // Mapping name + hFile_, + NULL, // Security attributes + PAGE_READWRITE, // There is not a write only mode for mapping + mappingSize.HighPart, // Enable mapping the whole file but the actual + // amount mapped is determined by MapViewOfFile + mappingSize.LowPart, + NULL); // Mapping name if (NULL == hMap_) { return IOErrorFromWindowsError( - "WindowsMmapFile failed to create file mapping for: " + filename_, - GetLastError()); + "WindowsMmapFile failed to create file mapping for: " + filename_, + GetLastError()); } mapping_size_ = reserved_size_; @@ -323,13 +317,13 @@ Status WinMmapFile::MapNewRegion() { // View must begin at the granularity aligned offset mapped_begin_ = reinterpret_cast( - MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, - view_size_, NULL)); + MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart, + view_size_, NULL)); if (!mapped_begin_) { status = IOErrorFromWindowsError( - "WindowsMmapFile failed to map file view: " + filename_, - GetLastError()); + "WindowsMmapFile failed to map file view: " + filename_, + GetLastError()); } else { mapped_end_ = mapped_begin_ + view_size_; dst_ = mapped_begin_; @@ -339,15 +333,15 @@ Status WinMmapFile::MapNewRegion() { return status; } -Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { +IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) { return fallocate(filename_, hFile_, spaceToReserve); } WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, size_t allocation_granularity, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, hFile, false), - WritableFile(options), + FSWritableFile(options), hMap_(NULL), page_size_(page_size), allocation_granularity_(allocation_granularity), @@ -373,17 +367,19 @@ WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, // View size must be both the multiple of allocation_granularity AND the // page size and the granularity is usually a multiple of a page size. 
- const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode + const size_t viewSize = + 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode view_size_ = Roundup(viewSize, allocation_granularity_); } WinMmapFile::~WinMmapFile() { if (hFile_) { - this->Close(); + this->Close(IOOptions(), nullptr); } } -Status WinMmapFile::Append(const Slice& data) { +IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) { const char* src = data.data(); size_t left = data.size(); @@ -392,9 +388,9 @@ Status WinMmapFile::Append(const Slice& data) { size_t avail = mapped_end_ - dst_; if (avail == 0) { - Status s = UnmapCurrentRegion(); + IOStatus s = UnmapCurrentRegion(); if (s.ok()) { - s = MapNewRegion(); + s = MapNewRegion(options, dbg); } if (!s.ok()) { @@ -416,30 +412,31 @@ Status WinMmapFile::Append(const Slice& data) { memset(dst_, 0, bytesToPad); } - return Status::OK(); + return IOStatus::OK(); } // Means Close() will properly take care of truncate // and it does not need any additional information -Status WinMmapFile::Truncate(uint64_t size) { - return Status::OK(); +IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); } -Status WinMmapFile::Close() { - Status s; +IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) { + IOStatus s; assert(NULL != hFile_); // We truncate to the precise size so no // uninitialized data at the end. SetEndOfFile // which we use does not write zeros and it is good. - uint64_t targetSize = GetFileSize(); + uint64_t targetSize = GetFileSize(options, dbg); if (mapped_begin_ != nullptr) { // Sync before unmapping to make sure everything // is on disk and there is not a lazy writing // so we are deterministic with the tests - Sync(); + Sync(options, dbg); s = UnmapCurrentRegion(); } @@ -448,14 +445,13 @@ Status WinMmapFile::Close() { if (!ret && s.ok()) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Failed to Close mapping for file: " + filename_, lastError); + "Failed to Close mapping for file: " + filename_, lastError); } hMap_ = NULL; } if (hFile_ != NULL) { - TruncateFile(targetSize); BOOL ret = ::CloseHandle(hFile_); @@ -464,18 +460,22 @@ Status WinMmapFile::Close() { if (!ret && s.ok()) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Failed to close file map handle: " + filename_, lastError); + "Failed to close file map handle: " + filename_, lastError); } } return s; } -Status WinMmapFile::Flush() { return Status::OK(); } +IOStatus WinMmapFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} // Flush only data -Status WinMmapFile::Sync() { - Status s; +IOStatus WinMmapFile::Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; // Some writes occurred since last sync if (dst_ > last_sync_) { @@ -485,15 +485,15 @@ Status WinMmapFile::Sync() { assert(dst_ < mapped_end_); size_t page_begin = - TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); + TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_); size_t page_end = - TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); + TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1); // Flush only the amount of that is a multiple of pages if (!::FlushViewOfFile(mapped_begin_ + page_begin, - (page_end - page_begin) + page_size_)) { + (page_end - page_begin) + page_size_)) { s = 
IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_, - GetLastError()); + GetLastError()); } else { last_sync_ = dst_; } @@ -503,16 +503,16 @@ Status WinMmapFile::Sync() { } /** -* Flush data as well as metadata to stable storage. -*/ -Status WinMmapFile::Fsync() { - Status s = Sync(); + * Flush data as well as metadata to stable storage. + */ +IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) { + IOStatus s = Sync(options, dbg); // Flush metadata if (s.ok() && pending_sync_) { if (!::FlushFileBuffers(hFile_)) { s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_, - GetLastError()); + GetLastError()); } pending_sync_ = false; } @@ -521,27 +521,31 @@ Status WinMmapFile::Fsync() { } /** -* Get the size of valid data in the file. This will not match the -* size that is returned from the filesystem because we use mmap -* to extend file by map_size every time. -*/ -uint64_t WinMmapFile::GetFileSize() { + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ +uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { size_t used = dst_ - mapped_begin_; return file_offset_ + used; } -Status WinMmapFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } -Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) { - Status status; - TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds); +IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus status; + TEST_KILL_RANDOM("WinMmapFile::Allocate"); // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(static_cast(offset + len), view_size_); + size_t spaceToReserve = + Roundup(static_cast(offset + len), view_size_); // Nothing to do if (spaceToReserve <= reserved_size_) { return status; @@ -563,31 +567,34 @@ size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const { // WinSequentialFile WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, f, options.use_direct_reads) {} WinSequentialFile::~WinSequentialFile() { assert(hFile_ != INVALID_HANDLE_VALUE); } -Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) { - Status s; +IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { + IOStatus s; size_t r = 0; assert(result != nullptr); if (WinFileData::use_direct_io()) { - return Status::NotSupported("Read() does not support direct_io"); + return IOStatus::NotSupported("Read() does not support direct_io"); } // Windows ReadFile API accepts a DWORD. // While it is possible to read in a loop if n is too big // it is an unlikely case. 
if (n > std::numeric_limits::max()) { - return Status::InvalidArgument("n is too big for a single ReadFile: " - + filename_); + return IOStatus::InvalidArgument("n is too big for a single ReadFile: " + + filename_); } - DWORD bytesToRead = static_cast(n); //cast is safe due to the check above + DWORD bytesToRead = + static_cast(n); // cast is safe due to the check above DWORD bytesRead = 0; BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL); if (ret != FALSE) { @@ -595,8 +602,7 @@ Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) { } else { auto lastError = GetLastError(); if (lastError != ERROR_HANDLE_EOF) { - s = IOErrorFromWindowsError("ReadFile failed: " + filename_, - lastError); + s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError); } } @@ -604,99 +610,91 @@ Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) { return s; } -Status WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset, size_t& bytes_read) const { +IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const { return pread(this, src, numBytes, offset, bytes_read); } -Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) { - - Status s; - +IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n, + const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) { if (!WinFileData::use_direct_io()) { - return Status::NotSupported("This function is only used for direct_io"); + return IOStatus::NotSupported("This function is only used for direct_io"); } - if (!IsSectorAligned(static_cast(offset)) || - !IsSectorAligned(n)) { - return Status::InvalidArgument( + if (!IsSectorAligned(static_cast(offset)) || !IsSectorAligned(n)) { + return IOStatus::InvalidArgument( "WinSequentialFile::PositionedRead: offset is not properly aligned"); } - size_t bytes_read = 0; // out param - s = PositionedReadInternal(scratch, static_cast(n), offset, bytes_read); + size_t bytes_read = 0; // out param + IOStatus s = PositionedReadInternal(scratch, static_cast(n), offset, + bytes_read); *result = Slice(scratch, bytes_read); return s; } - -Status WinSequentialFile::Skip(uint64_t n) { - // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit - // integer. As such it is a highly unlikley case to have n so large. +IOStatus WinSequentialFile::Skip(uint64_t n) { + // Can't handle more than signed max as SetFilePointerEx accepts a signed + // 64-bit integer. As such it is a highly unlikley case to have n so large. 
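// The direct I/O path in PositionedRead() above rejects offsets and sizes
// that are not sector aligned, and ReadImpl() further down also checks
// buffer alignment. A minimal sketch of such predicates, assuming a
// non-zero unit; these are illustrative helpers, not RocksDB's
// IsSectorAligned()/IsAligned():
#include <cstddef>
#include <cstdint>
inline bool IsMultipleOfSketch(uint64_t value, uint64_t unit) {
  return unit != 0 && value % unit == 0;  // e.g. offset % 512 == 0
}
inline bool IsPointerAlignedSketch(const void* p, size_t alignment) {
  return alignment != 0 && reinterpret_cast<uintptr_t>(p) % alignment == 0;
}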
if (n > static_cast(std::numeric_limits::max())) { - return Status::InvalidArgument("n is too large for a single SetFilePointerEx() call" + - filename_); + return IOStatus::InvalidArgument( + "n is too large for a single SetFilePointerEx() call" + filename_); } LARGE_INTEGER li; - li.QuadPart = static_cast(n); //cast is safe due to the check above + li.QuadPart = static_cast(n); // cast is safe due to the check + // above BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT); if (ret == FALSE) { auto lastError = GetLastError(); return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_, lastError); } - return Status::OK(); + return IOStatus::OK(); } -Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } ////////////////////////////////////////////////////////////////////////////////////////////////// /// WinRandomAccessBase -inline -Status WinRandomAccessImpl::PositionedReadInternal(char* src, - size_t numBytes, - uint64_t offset, - size_t& bytes_read) const { +inline IOStatus WinRandomAccessImpl::PositionedReadInternal( + char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const { return pread(file_base_, src, numBytes, offset, bytes_read); } -inline -WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base, - size_t alignment, - const EnvOptions& options) : - file_base_(file_base), - alignment_(alignment) { - +inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base, + size_t alignment, + const FileOptions& options) + : file_base_(file_base), alignment_(alignment) { assert(!options.use_mmap_reads); } -inline -Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - - Status s; - +inline IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, + Slice* result, + char* scratch) const { // Check buffer alignment if (file_base_->use_direct_io()) { if (!IsSectorAligned(static_cast(offset)) || !IsAligned(alignment_, scratch)) { - return Status::InvalidArgument( - "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned"); + return IOStatus::InvalidArgument( + "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly " + "aligned"); } } if (n == 0) { *result = Slice(scratch, 0); - return s; + return IOStatus::OK(); } size_t bytes_read = 0; - s = PositionedReadInternal(scratch, n, offset, bytes_read); + IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read); *result = Slice(scratch, bytes_read); return s; } @@ -706,20 +704,21 @@ Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result, WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, hFile, options.use_direct_reads), WinRandomAccessImpl(this, alignment, options) {} -WinRandomAccessFile::~WinRandomAccessFile() { -} +WinRandomAccessFile::~WinRandomAccessFile() {} -Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { +IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, + IODebugContext* /*dbg*/) const { return ReadImpl(offset, n, result, scratch); } -Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { - return Status::OK(); +IOStatus 
WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { + return IOStatus::OK(); } size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { @@ -734,27 +733,26 @@ size_t WinRandomAccessFile::GetRequiredBufferAlignment() const { // WinWritableImpl // -inline -Status WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) { - return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), spaceToReserve); +inline IOStatus WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) { + return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), + spaceToReserve); } -inline -WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment) - : file_data_(file_data), - alignment_(alignment), - next_write_offset_(0), - reservedsize_(0) { - +inline WinWritableImpl::WinWritableImpl(WinFileData* file_data, + size_t alignment) + : file_data_(file_data), + alignment_(alignment), + next_write_offset_(0), + reservedsize_(0) { // Query current position in case ReopenWritableFile is called // This position is only important for buffered writes // for unbuffered writes we explicitely specify the position. LARGE_INTEGER zero_move; - zero_move.QuadPart = 0; // Do not move + zero_move.QuadPart = 0; // Do not move LARGE_INTEGER pos; pos.QuadPart = 0; BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos, - FILE_CURRENT); + FILE_CURRENT); // Querying no supped to fail if (ret != 0) { next_write_offset_ = pos.QuadPart; @@ -763,17 +761,15 @@ WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment) } } -inline -Status WinWritableImpl::AppendImpl(const Slice& data) { - - Status s; +inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) { + IOStatus s; if (data.size() > std::numeric_limits::max()) { - return Status::InvalidArgument("data is too long for a single write" + - file_data_->GetName()); + return IOStatus::InvalidArgument("data is too long for a single write" + + file_data_->GetName()); } - size_t bytes_written = 0; // out param + size_t bytes_written = 0; // out param if (file_data_->use_direct_io()) { // With no offset specified we are appending @@ -781,56 +777,53 @@ Status WinWritableImpl::AppendImpl(const Slice& data) { assert(IsSectorAligned(next_write_offset_)); if (!IsSectorAligned(data.size()) || !IsAligned(static_cast(GetAlignement()), data.data())) { - s = Status::InvalidArgument( - "WriteData must be page aligned, size must be sector aligned"); + s = IOStatus::InvalidArgument( + "WriteData must be page aligned, size must be sector aligned"); } else { s = pwrite(file_data_, data, next_write_offset_, bytes_written); } } else { - DWORD bytesWritten = 0; if (!WriteFile(file_data_->GetFileHandle(), data.data(), - static_cast(data.size()), &bytesWritten, NULL)) { + static_cast(data.size()), &bytesWritten, NULL)) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "Failed to WriteFile: " + file_data_->GetName(), - lastError); + "Failed to WriteFile: " + file_data_->GetName(), lastError); } else { bytes_written = bytesWritten; } } - if(s.ok()) { + if (s.ok()) { if (bytes_written == data.size()) { // This matters for direct_io cases where // we rely on the fact that next_write_offset_ // is sector aligned next_write_offset_ += bytes_written; } else { - s = Status::IOError("Failed to write all bytes: " + - file_data_->GetName()); + s = IOStatus::IOError("Failed to write all bytes: " + + file_data_->GetName()); } } return s; } -inline -Status WinWritableImpl::PositionedAppendImpl(const 
Slice& data, uint64_t offset) { - - if(file_data_->use_direct_io()) { +inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data, + uint64_t offset) { + if (file_data_->use_direct_io()) { if (!IsSectorAligned(static_cast(offset)) || !IsSectorAligned(data.size()) || !IsAligned(static_cast(GetAlignement()), data.data())) { - return Status::InvalidArgument( - "Data and offset must be page aligned, size must be sector aligned"); + return IOStatus::InvalidArgument( + "Data and offset must be page aligned, size must be sector aligned"); } } size_t bytes_written = 0; - Status s = pwrite(file_data_, data, offset, bytes_written); + IOStatus s = pwrite(file_data_, data, offset, bytes_written); - if(s.ok()) { + if (s.ok()) { if (bytes_written == data.size()) { // For sequential write this would be simple // size extension by data.size() @@ -839,23 +832,21 @@ Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) next_write_offset_ = write_end; } } else { - s = Status::IOError("Failed to write all of the requested data: " + - file_data_->GetName()); + s = IOStatus::IOError("Failed to write all of the requested data: " + + file_data_->GetName()); } } return s; } -inline -Status WinWritableImpl::TruncateImpl(uint64_t size) { - +inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) { // It is tempting to check for the size for sector alignment // but truncation may come at the end and there is not a requirement // for this to be sector aligned so long as we do not attempt to write // after that. The interface docs state that the behavior is undefined // in that case. - Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), - size); + IOStatus s = + ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size); if (s.ok()) { next_write_offset_ = size; @@ -863,50 +854,48 @@ Status WinWritableImpl::TruncateImpl(uint64_t size) { return s; } -inline -Status WinWritableImpl::CloseImpl() { - - Status s; +inline IOStatus WinWritableImpl::CloseImpl() { + IOStatus s; auto hFile = file_data_->GetFileHandle(); assert(INVALID_HANDLE_VALUE != hFile); if (!::FlushFileBuffers(hFile)) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("FlushFileBuffers failed at Close() for: " + - file_data_->GetName(), - lastError); + s = IOErrorFromWindowsError( + "FlushFileBuffers failed at Close() for: " + file_data_->GetName(), + lastError); } - if(!file_data_->CloseFile() && s.ok()) { + if (!file_data_->CloseFile() && s.ok()) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(), - lastError); + s = IOErrorFromWindowsError( + "CloseHandle failed for: " + file_data_->GetName(), lastError); } return s; } -inline -Status WinWritableImpl::SyncImpl() { - Status s; - if (!::FlushFileBuffers (file_data_->GetFileHandle())) { +inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + IOStatus s; + if (!::FlushFileBuffers(file_data_->GetFileHandle())) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), lastError); + "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), + lastError); } return s; } - -inline -Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { - Status status; - TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds); +inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { + IOStatus status; + 
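// Note: the reservation below rounds (offset + len) up to the write
// alignment before preallocating; for example offset + len = 100000 with
// alignment_ = 4096 gives spaceToReserve = 25 * 4096 = 102400, and the
// fallocate call is skipped when reservedsize_ already covers that amount.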
TEST_KILL_RANDOM("WinWritableFile::Allocate"); // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(static_cast(offset + len), static_cast(alignment_)); + size_t spaceToReserve = Roundup(static_cast(offset + len), + static_cast(alignment_)); // Nothing to do if (spaceToReserve <= reservedsize_) { return status; @@ -920,66 +909,78 @@ Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { return status; } - //////////////////////////////////////////////////////////////////////////////// /// WinWritableFile WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, size_t /* capacity */, - const EnvOptions& options) + const FileOptions& options) : WinFileData(fname, hFile, options.use_direct_writes), WinWritableImpl(this, alignment), - WritableFile(options) { + FSWritableFile(options) { assert(!options.use_mmap_writes); } -WinWritableFile::~WinWritableFile() { -} +WinWritableFile::~WinWritableFile() {} // Indicates if the class makes use of direct I/O -bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); } +bool WinWritableFile::use_direct_io() const { + return WinFileData::use_direct_io(); +} size_t WinWritableFile::GetRequiredBufferAlignment() const { return static_cast(GetAlignement()); } -Status WinWritableFile::Append(const Slice& data) { +IOStatus WinWritableFile::Append(const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return AppendImpl(data); } -Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { +IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return PositionedAppendImpl(data, offset); } // Need to implement this so the file is truncated correctly // when buffered and unbuffered mode -Status WinWritableFile::Truncate(uint64_t size) { +IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return TruncateImpl(size); } -Status WinWritableFile::Close() { +IOStatus WinWritableFile::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return CloseImpl(); } - // write out the cached data to the OS cache - // This is now taken care of the WritableFileWriter -Status WinWritableFile::Flush() { - return Status::OK(); +// write out the cached data to the OS cache +// This is now taken care of the WritableFileWriter +IOStatus WinWritableFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); } -Status WinWritableFile::Sync() { - return SyncImpl(); +IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); } -Status WinWritableFile::Fsync() { return SyncImpl(); } +IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); +} bool WinWritableFile::IsSyncThreadSafe() const { return true; } -uint64_t WinWritableFile::GetFileSize() { +uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return GetFileNextWriteOffset(); } -Status WinWritableFile::Allocate(uint64_t offset, uint64_t len) { +IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return AllocateImpl(offset, len); } @@ -991,36 +992,43 @@ size_t 
WinWritableFile::GetUniqueId(char* id, size_t max_size) const { /// WinRandomRWFile WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile, - size_t alignment, const EnvOptions& options) + size_t alignment, const FileOptions& options) : WinFileData(fname, hFile, options.use_direct_reads && options.use_direct_writes), WinRandomAccessImpl(this, alignment, options), WinWritableImpl(this, alignment) {} -bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); } +bool WinRandomRWFile::use_direct_io() const { + return WinFileData::use_direct_io(); +} size_t WinRandomRWFile::GetRequiredBufferAlignment() const { return static_cast(GetAlignement()); } -Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) { +IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return PositionedAppendImpl(data, offset); } -Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { +IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n, + const IOOptions& /*options*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) const { return ReadImpl(offset, n, result, scratch); } -Status WinRandomRWFile::Flush() { - return Status::OK(); +IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); } -Status WinRandomRWFile::Sync() { - return SyncImpl(); +IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) { + return SyncImpl(options, dbg); } -Status WinRandomRWFile::Close() { +IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { return CloseImpl(); } @@ -1029,9 +1037,9 @@ Status WinRandomRWFile::Close() { WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { BOOL ret #if defined(_MSC_VER) - = FALSE; + = FALSE; #else - __attribute__((__unused__)); + __attribute__((__unused__)); #endif if (base_ != nullptr) { ret = ::UnmapViewOfFile(base_); @@ -1053,7 +1061,10 @@ WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { ////////////////////////////////////////////////////////////////////////// /// WinDirectory -Status WinDirectory::Fsync() { return Status::OK(); } +IOStatus WinDirectory::Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) { + return IOStatus::OK(); +} size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const { return GetUniqueIdFromFile(handle_, id, max_size); @@ -1067,7 +1078,7 @@ WinFileLock::~WinFileLock() { assert(ret); } -} +} // namespace port } // namespace ROCKSDB_NAMESPACE #endif diff --git a/port/win/io_win.h b/port/win/io_win.h index d7aa7b48397..4119f5add2c 100644 --- a/port/win/io_win.h +++ b/port/win/io_win.h @@ -9,51 +9,53 @@ #pragma once #include +#include + #include #include +#include "rocksdb/file_system.h" #include "rocksdb/status.h" -#include "rocksdb/env.h" #include "util/aligned_buffer.h" - -#include +#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { namespace port { std::string GetWindowsErrSz(DWORD err); -inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { +inline IOStatus IOErrorFromWindowsError(const std::string& context, DWORD err) { return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) - ? Status::NoSpace(context, GetWindowsErrSz(err)) + ? IOStatus::NoSpace(context, GetWindowsErrSz(err)) : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND)) - ? 
Status::PathNotFound(context, GetWindowsErrSz(err)) - : Status::IOError(context, GetWindowsErrSz(err)); + ? IOStatus::PathNotFound(context, GetWindowsErrSz(err)) + : IOStatus::IOError(context, GetWindowsErrSz(err)); } -inline Status IOErrorFromLastWindowsError(const std::string& context) { +inline IOStatus IOErrorFromLastWindowsError(const std::string& context) { return IOErrorFromWindowsError(context, GetLastError()); } -inline Status IOError(const std::string& context, int err_number) { +inline IOStatus IOError(const std::string& context, int err_number) { return (err_number == ENOSPC) - ? Status::NoSpace(context, strerror(err_number)) + ? IOStatus::NoSpace(context, errnoStr(err_number).c_str()) : (err_number == ENOENT) - ? Status::PathNotFound(context, strerror(err_number)) - : Status::IOError(context, strerror(err_number)); + ? IOStatus::PathNotFound(context, + errnoStr(err_number).c_str()) + : IOStatus::IOError(context, errnoStr(err_number).c_str()); } class WinFileData; -Status pwrite(const WinFileData* file_data, const Slice& data, - uint64_t offset, size_t& bytes_written); +IOStatus pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written); -Status pread(const WinFileData* file_data, char* src, size_t num_bytes, - uint64_t offset, size_t& bytes_read); +IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read); -Status fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size); +IOStatus fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size); -Status ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize); +IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize); size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size); @@ -95,34 +97,38 @@ class WinFileData { WinFileData& operator=(const WinFileData&) = delete; }; -class WinSequentialFile : protected WinFileData, public SequentialFile { - +class WinSequentialFile : protected WinFileData, public FSSequentialFile { // Override for behavior change when creating a custom env - virtual Status PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset, size_t& bytes_read) const; + virtual IOStatus PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const; -public: + public: WinSequentialFile(const std::string& fname, HANDLE f, - const EnvOptions& options); + const FileOptions& options); ~WinSequentialFile(); WinSequentialFile(const WinSequentialFile&) = delete; WinSequentialFile& operator=(const WinSequentialFile&) = delete; - virtual Status Read(size_t n, Slice* result, char* scratch) override; - virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) override; + IOStatus Read(size_t n, const IOOptions& options, Slice* result, + char* scratch, IODebugContext* dbg) override; + IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) override; - virtual Status Skip(uint64_t n) override; + IOStatus Skip(uint64_t n) override; - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } + virtual bool use_direct_io() const override { + return WinFileData::use_direct_io(); + } }; // mmap() based random-access -class WinMmapReadableFile : private WinFileData, 
public RandomAccessFile { +class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile { HANDLE hMap_; const void* mapped_region_; @@ -138,10 +144,11 @@ class WinMmapReadableFile : private WinFileData, public RandomAccessFile { WinMmapReadableFile(const WinMmapReadableFile&) = delete; WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete; - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; - virtual Status InvalidateCache(size_t offset, size_t length) override; + virtual IOStatus InvalidateCache(size_t offset, size_t length) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -150,7 +157,7 @@ class WinMmapReadableFile : private WinFileData, public RandomAccessFile { // data to the file. This is safe since we either properly close the // file before reading from it, or for log files, the reading code // knows enough to skip zero suffixes. -class WinMmapFile : private WinFileData, public WritableFile { +class WinMmapFile : private WinFileData, public FSWritableFile { private: HANDLE hMap_; @@ -179,51 +186,59 @@ class WinMmapFile : private WinFileData, public WritableFile { // Can only truncate or reserve to a sector size aligned if // used on files that are opened with Unbuffered I/O - Status TruncateFile(uint64_t toSize); + IOStatus TruncateFile(uint64_t toSize); - Status UnmapCurrentRegion(); + IOStatus UnmapCurrentRegion(); - Status MapNewRegion(); + IOStatus MapNewRegion(const IOOptions& options, IODebugContext* dbg); - virtual Status PreallocateInternal(uint64_t spaceToReserve); + virtual IOStatus PreallocateInternal(uint64_t spaceToReserve); public: WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size, - size_t allocation_granularity, const EnvOptions& options); + size_t allocation_granularity, const FileOptions& options); ~WinMmapFile(); WinMmapFile(const WinMmapFile&) = delete; WinMmapFile& operator=(const WinMmapFile&) = delete; - virtual Status Append(const Slice& data) override; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } // Means Close() will properly take care of truncate // and it does not need any additional information - virtual Status Truncate(uint64_t size) override; + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; - virtual Status Close() override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; // Flush only data - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; /** - * Flush data as well as metadata to stable storage. - */ - virtual Status Fsync() override; + * Flush data as well as metadata to stable storage. + */ + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; /** - * Get the size of valid data in the file. This will not match the - * size that is returned from the filesystem because we use mmap - * to extend file by map_size every time. 
- */ - virtual uint64_t GetFileSize() override; + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; - virtual Status Allocate(uint64_t offset, uint64_t len) override; + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -231,24 +246,24 @@ class WinMmapFile : private WinFileData, public WritableFile { class WinRandomAccessImpl { protected: WinFileData* file_base_; - size_t alignment_; + size_t alignment_; // Override for behavior change when creating a custom env - virtual Status PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset, size_t& bytes_read) const; + virtual IOStatus PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, + size_t& bytes_read) const; WinRandomAccessImpl(WinFileData* file_base, size_t alignment, - const EnvOptions& options); + const FileOptions& options); virtual ~WinRandomAccessImpl() {} - Status ReadImpl(uint64_t offset, size_t n, Slice* result, - char* scratch) const; + IOStatus ReadImpl(uint64_t offset, size_t n, Slice* result, + char* scratch) const; size_t GetAlignment() const { return alignment_; } public: - WinRandomAccessImpl(const WinRandomAccessImpl&) = delete; WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete; }; @@ -258,21 +273,24 @@ class WinRandomAccessFile : private WinFileData, protected WinRandomAccessImpl, // Want to be able to override // PositionedReadInternal - public RandomAccessFile { + public FSRandomAccessFile { public: WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, - const EnvOptions& options); + const FileOptions& options); ~WinRandomAccessFile(); - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; - virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } + virtual bool use_direct_io() const override { + return WinFileData::use_direct_io(); + } - virtual Status InvalidateCache(size_t offset, size_t length) override; + IOStatus InvalidateCache(size_t offset, size_t length) override; virtual size_t GetRequiredBufferAlignment() const override; }; @@ -293,10 +311,11 @@ class WinWritableImpl { protected: WinFileData* file_data_; const uint64_t alignment_; - uint64_t next_write_offset_; // Needed because Windows does not support O_APPEND + uint64_t + next_write_offset_; // Needed because Windows does not support O_APPEND uint64_t reservedsize_; // how far we have reserved space - virtual Status PreallocateInternal(uint64_t spaceToReserve); + virtual IOStatus PreallocateInternal(uint64_t spaceToReserve); WinWritableImpl(WinFileData* file_data, size_t alignment); @@ -304,17 +323,17 @@ class WinWritableImpl { uint64_t GetAlignement() const { return alignment_; } - Status AppendImpl(const Slice& data); + IOStatus AppendImpl(const Slice& data); // Requires that the data is aligned as specified by // 
GetRequiredBufferAlignment() - Status PositionedAppendImpl(const Slice& data, uint64_t offset); + IOStatus PositionedAppendImpl(const Slice& data, uint64_t offset); - Status TruncateImpl(uint64_t size); + IOStatus TruncateImpl(uint64_t size); - Status CloseImpl(); + IOStatus CloseImpl(); - Status SyncImpl(); + IOStatus SyncImpl(const IOOptions& options, IODebugContext* dbg); uint64_t GetFileNextWriteOffset() { // Double accounting now here with WritableFileWriter @@ -326,7 +345,7 @@ class WinWritableImpl { return next_write_offset_; } - Status AllocateImpl(uint64_t offset, uint64_t len); + IOStatus AllocateImpl(uint64_t offset, uint64_t len); public: WinWritableImpl(const WinWritableImpl&) = delete; @@ -335,32 +354,47 @@ class WinWritableImpl { class WinWritableFile : private WinFileData, protected WinWritableImpl, - public WritableFile { + public FSWritableFile { public: WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, - size_t capacity, const EnvOptions& options); + size_t capacity, const FileOptions& options); ~WinWritableFile(); - virtual Status Append(const Slice& data) override; + IOStatus Append(const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; + IOStatus Append(const Slice& data, const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, opts, dbg); + } // Requires that the data is aligned as specified by // GetRequiredBufferAlignment() - virtual Status PositionedAppend(const Slice& data, uint64_t offset) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& options, + IODebugContext* dbg) override; + IOStatus PositionedAppend(const Slice& data, uint64_t offset, + const IOOptions& opts, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return PositionedAppend(data, offset, opts, dbg); + } // Need to implement this so the file is truncated correctly // when buffered and unbuffered mode - virtual Status Truncate(uint64_t size) override; + IOStatus Truncate(uint64_t size, const IOOptions& options, + IODebugContext* dbg) override; - virtual Status Close() override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; // write out the cached data to the OS cache // This is now taken care of the WritableFileWriter - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Fsync() override; + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; virtual bool IsSyncThreadSafe() const override; @@ -370,9 +404,10 @@ class WinWritableFile : private WinFileData, virtual size_t GetRequiredBufferAlignment() const override; - virtual uint64_t GetFileSize() override; + uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Allocate(uint64_t offset, uint64_t len) override; + IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, + IODebugContext* dbg) override; virtual size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -380,10 +415,10 @@ class WinWritableFile : private WinFileData, class WinRandomRWFile : private WinFileData, protected WinRandomAccessImpl, protected WinWritableImpl, - public RandomRWFile { + public FSRandomRWFile { public: WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t 
alignment, - const EnvOptions& options); + const FileOptions& options); ~WinRandomRWFile() {} @@ -397,45 +432,50 @@ class WinRandomRWFile : private WinFileData, // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. - virtual Status Write(uint64_t offset, const Slice& data) override; + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, + IODebugContext* dbg) override; // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. // Returns Status::OK() on success. - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override; + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override; - virtual Status Flush() override; + IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Sync() override; + IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override; - virtual Status Fsync() override { return Sync(); } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override { + return Sync(options, dbg); + } - virtual Status Close() override; + IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; }; class WinMemoryMappedBuffer : public MemoryMappedFileBuffer { -private: - HANDLE file_handle_; - HANDLE map_handle_; -public: - WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, size_t size) : - MemoryMappedFileBuffer(base, size), - file_handle_(file_handle), - map_handle_(map_handle) {} + private: + HANDLE file_handle_; + HANDLE map_handle_; + + public: + WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, + size_t size) + : MemoryMappedFileBuffer(base, size), + file_handle_(file_handle), + map_handle_(map_handle) {} ~WinMemoryMappedBuffer() override; }; -class WinDirectory : public Directory { +class WinDirectory : public FSDirectory { HANDLE handle_; + public: explicit WinDirectory(HANDLE h) noexcept : handle_(h) { assert(handle_ != INVALID_HANDLE_VALUE); } - ~WinDirectory() { - ::CloseHandle(handle_); - } - virtual Status Fsync() override; + ~WinDirectory() { ::CloseHandle(handle_); } + IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override; size_t GetUniqueId(char* id, size_t max_size) const override; }; @@ -452,5 +492,5 @@ class WinFileLock : public FileLock { private: HANDLE hFile_; }; -} +} // namespace port } // namespace ROCKSDB_NAMESPACE diff --git a/port/win/port_win.h b/port/win/port_win.h index 2c5b8ff0533..a6a6de27801 100644 --- a/port/win/port_win.h +++ b/port/win/port_win.h @@ -355,6 +355,7 @@ extern void SetCpuPriority(ThreadId id, CpuPriority priority); #define RX_FILESTRING std::wstring #define RX_FN(a) ROCKSDB_NAMESPACE::port::utf8_to_utf16(a) #define FN_TO_RX(a) ROCKSDB_NAMESPACE::port::utf16_to_utf8(a) +#define RX_FNCMP(a, b) ::wcscmp(a, RX_FN(b).c_str()) #define RX_FNLEN(a) ::wcslen(a) #define RX_DeleteFile DeleteFileW @@ -379,6 +380,7 @@ extern void SetCpuPriority(ThreadId id, CpuPriority priority); #define RX_FILESTRING std::string #define RX_FN(a) a #define FN_TO_RX(a) a +#define RX_FNCMP(a, b) strcmp(a, b) #define RX_FNLEN(a) strlen(a) #define RX_DeleteFile DeleteFileA @@ -388,7 +390,7 @@ extern void SetCpuPriority(ThreadId id, CpuPriority priority); #define RX_FindFirstFileEx FindFirstFileExA #define RX_CreateDirectory CreateDirectoryA #define 
RX_FindNextFile FindNextFileA -#define RX_WIN32_FIND_DATA WIN32_FIND_DATA +#define RX_WIN32_FIND_DATA WIN32_FIND_DATAA #define RX_CreateDirectory CreateDirectoryA #define RX_RemoveDirectory RemoveDirectoryA #define RX_GetFileAttributesEx GetFileAttributesExA diff --git a/port/win/win_logger.cc b/port/win/win_logger.cc index a9b10c04b8c..a45f3c6d439 100644 --- a/port/win/win_logger.cc +++ b/port/win/win_logger.cc @@ -13,31 +13,33 @@ #if defined(OS_WIN) #include "port/win/win_logger.h" -#include "port/win/io_win.h" -#include +#include #include #include -#include -#include -#include "rocksdb/env.h" +#include +#include #include "monitoring/iostats_context_imp.h" #include "port/sys_time.h" +#include "port/win/env_win.h" +#include "port/win/io_win.h" +#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { namespace port { -WinLogger::WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file, +WinLogger::WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file, const InfoLogLevel log_level) : Logger(log_level), file_(file), gettid_(gettid), log_size_(0), last_flush_micros_(0), - env_(env), + clock_(clock), flush_pending_(false) { assert(file_ != NULL); assert(file_ != INVALID_HANDLE_VALUE); @@ -53,7 +55,7 @@ void WinLogger::DebugWriter(const char* str, int len) { } } -WinLogger::~WinLogger() { CloseInternal(); } +WinLogger::~WinLogger() { CloseInternal().PermitUncheckedError(); } Status WinLogger::CloseImpl() { return CloseInternal(); @@ -88,7 +90,7 @@ void WinLogger::Flush() { // for perf reasons. } - last_flush_micros_ = env_->NowMicros(); + last_flush_micros_ = clock_->NowMicros(); } void WinLogger::Logv(const char* format, va_list ap) { diff --git a/port/win/win_logger.h b/port/win/win_logger.h index 116e7898db6..809c7d5a2bb 100644 --- a/port/win/win_logger.h +++ b/port/win/win_logger.h @@ -12,22 +12,21 @@ #pragma once +#include +#include + #include +#include #include "rocksdb/env.h" -#include -#include - namespace ROCKSDB_NAMESPACE { - -class Env; +class SystemClock; namespace port { - class WinLogger : public ROCKSDB_NAMESPACE::Logger { public: - WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file, + WinLogger(uint64_t (*gettid)(), SystemClock* clock, HANDLE file, const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL); virtual ~WinLogger(); @@ -54,7 +53,7 @@ class WinLogger : public ROCKSDB_NAMESPACE::Logger { uint64_t (*gettid_)(); // Return the thread id for the current thread std::atomic_size_t log_size_; std::atomic_uint_fast64_t last_flush_micros_; - Env* env_; + SystemClock* clock_; bool flush_pending_; Status CloseInternal(); diff --git a/src.mk b/src.mk index fe43adc46fa..70f657de4de 100644 --- a/src.mk +++ b/src.mk @@ -1,6 +1,7 @@ # These are the sources from which librocksdb.a is built: LIB_SOURCES = \ cache/cache.cc \ + cache/cache_entry_roles.cc \ cache/clock_cache.cc \ cache/lru_cache.cc \ cache/sharded_cache.cc \ @@ -21,8 +22,10 @@ LIB_SOURCES = \ cloud/cloud_storage_provider.cc \ cloud/cloud_file_cache.cc \ db/arena_wrapped_db_iter.cc \ + db/blob/blob_fetcher.cc \ db/blob/blob_file_addition.cc \ db/blob/blob_file_builder.cc \ + db/blob/blob_file_cache.cc \ db/blob/blob_file_garbage.cc \ db/blob/blob_file_meta.cc \ db/blob/blob_file_reader.cc \ @@ -32,7 +35,6 @@ LIB_SOURCES = \ db/builder.cc \ db/c.cc \ db/column_family.cc \ - db/compacted_db_impl.cc \ db/compaction/compaction.cc \ db/compaction/compaction_iterator.cc \ db/compaction/compaction_job.cc \ @@ -43,6 +45,7 @@ LIB_SOURCES = \ db/compaction/sst_partitioner.cc \ 
db/convenience.cc \ db/db_filesnapshot.cc \ + db/db_impl/compacted_db_impl.cc \ db/db_impl/db_impl.cc \ db/db_impl/db_impl_compaction_flush.cc \ db/db_impl/db_impl_debug.cc \ @@ -94,6 +97,7 @@ LIB_SOURCES = \ db/write_batch_base.cc \ db/write_controller.cc \ db/write_thread.cc \ + env/composite_env.cc \ env/env.cc \ env/env_chroot.cc \ env/env_encryption.cc \ @@ -101,6 +105,7 @@ LIB_SOURCES = \ env/env_posix.cc \ env/file_system.cc \ env/fs_posix.cc \ + env/fs_remap.cc \ env/file_system_tracer.cc \ env/io_posix.cc \ env/mock_env.cc \ @@ -108,6 +113,7 @@ LIB_SOURCES = \ file/file_prefetch_buffer.cc \ file/file_util.cc \ file/filename.cc \ + file/line_file_reader.cc \ file/random_access_file_reader.cc \ file/read_write_util.cc \ file/readahead_raf.cc \ @@ -143,6 +149,7 @@ LIB_SOURCES = \ monitoring/thread_status_util_debug.cc \ options/cf_options.cc \ options/configurable.cc \ + options/customizable.cc \ options/db_options.cc \ options/options.cc \ options/options_helper.cc \ @@ -217,11 +224,13 @@ LIB_SOURCES = \ util/compression_context_cache.cc \ util/concurrent_task_limiter_impl.cc \ util/crc32c.cc \ + util/crc32c_arm64.cc \ util/dynamic_bloom.cc \ util/hash.cc \ util/murmurhash.cc \ util/random.cc \ util/rate_limiter.cc \ + util/ribbon_config.cc \ util/slice.cc \ util/file_checksum_helper.cc \ util/status.cc \ @@ -267,8 +276,9 @@ LIB_SOURCES = \ utilities/simulator_cache/sim_cache.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ utilities/trace/file_trace_reader_writer.cc \ - utilities/transactions/lock/lock_tracker.cc \ - utilities/transactions/lock/point_lock_tracker.cc \ + utilities/transactions/lock/lock_manager.cc \ + utilities/transactions/lock/point/point_lock_tracker.cc \ + utilities/transactions/lock/point/point_lock_manager.cc \ utilities/transactions/optimistic_transaction.cc \ utilities/transactions/optimistic_transaction_db_impl.cc \ utilities/transactions/pessimistic_transaction.cc \ @@ -276,7 +286,6 @@ LIB_SOURCES = \ utilities/transactions/snapshot_checker.cc \ utilities/transactions/transaction_base.cc \ utilities/transactions/transaction_db_mutex_impl.cc \ - utilities/transactions/transaction_lock_mgr.cc \ utilities/transactions/transaction_util.cc \ utilities/transactions/write_prepared_txn.cc \ utilities/transactions/write_prepared_txn_db.cc \ @@ -286,11 +295,6 @@ LIB_SOURCES = \ utilities/write_batch_with_index/write_batch_with_index.cc \ utilities/write_batch_with_index/write_batch_with_index_internal.cc \ -ifeq ($(ARMCRC_SOURCE),1) -LIB_SOURCES +=\ - util/crc32c_arm64.cc -endif - ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1)) LIB_SOURCES_ASM =\ util/crc32c_ppc_asm.S @@ -301,6 +305,22 @@ LIB_SOURCES_ASM = LIB_SOURCES_C = endif +RANGE_TREE_SOURCES =\ + utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc \ + utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc \ + utilities/transactions/lock/range/range_tree/lib/standalone_port.cc \ + 
utilities/transactions/lock/range/range_tree/lib/util/dbt.cc \ + utilities/transactions/lock/range/range_tree/lib/util/memarena.cc \ + utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc \ + utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc + TOOL_LIB_SOURCES = \ tools/io_tracer_parser_tool.cc \ tools/ldb_cmd.cc \ @@ -317,6 +337,10 @@ MOCK_LIB_SOURCES = \ BENCH_LIB_SOURCES = \ tools/db_bench_tool.cc \ + tools/simulated_hybrid_file_system.cc \ + +CACHE_BENCH_LIB_SOURCES = \ + cache/cache_bench_tool.cc \ STRESS_LIB_SOURCES = \ db_stress_tool/batched_ops_stress.cc \ @@ -378,15 +402,21 @@ TEST_MAIN_SOURCES = \ cloud/remote_compaction_test.cc \ db/blob/blob_file_addition_test.cc \ db/blob/blob_file_builder_test.cc \ + db/blob/blob_file_cache_test.cc \ db/blob/blob_file_garbage_test.cc \ db/blob/blob_file_reader_test.cc \ + db/blob/db_blob_basic_test.cc \ + db/blob/db_blob_compaction_test.cc \ + db/blob/db_blob_corruption_test.cc \ db/blob/db_blob_index_test.cc \ db/column_family_test.cc \ db/compact_files_test.cc \ + db/compaction/clipping_iterator_test.cc \ db/compaction/compaction_iterator_test.cc \ db/compaction/compaction_job_test.cc \ db/compaction/compaction_job_stats_test.cc \ db/compaction/compaction_picker_test.cc \ + db/compaction/compaction_service_test.cc \ db/comparator_db_test.cc \ db/corruption_test.cc \ db/cuckoo_table_db_test.cc \ @@ -405,6 +435,7 @@ TEST_MAIN_SOURCES = \ db/db_iter_test.cc \ db/db_iter_stress_test.cc \ db/db_iterator_test.cc \ + db/db_kv_checksum_test.cc \ db/db_log_iter_test.cc \ db/db_memtable_test.cc \ db/db_merge_operator_test.cc \ @@ -412,7 +443,7 @@ TEST_MAIN_SOURCES = \ db/db_options_test.cc \ db/db_properties_test.cc \ db/db_range_del_test.cc \ - db/db_impl/db_secondary_test.cc \ + db/db_secondary_test.cc \ db/db_sst_test.cc \ db/db_statistics_test.cc \ db/db_table_properties_test.cc \ @@ -423,6 +454,7 @@ TEST_MAIN_SOURCES = \ db/db_universal_compaction_test.cc \ db/db_wal_test.cc \ db/db_with_timestamp_compaction_test.cc \ + db/db_write_buffer_manager_test.cc \ db/db_write_test.cc \ db/dbformat_test.cc \ db/deletefile_test.cc \ @@ -431,7 +463,6 @@ TEST_MAIN_SOURCES = \ db/external_sst_file_test.cc \ db/fault_injection_test.cc \ db/file_indexer_test.cc \ - db/file_reader_writer_test.cc \ db/filename_test.cc \ db/flush_job_test.cc \ db/listener_test.cc \ @@ -477,6 +508,7 @@ TEST_MAIN_SOURCES = \ monitoring/statistics_test.cc \ monitoring/stats_history_test.cc \ options/configurable_test.cc \ + options/customizable_test.cc \ options/options_settable_test.cc \ options/options_test.cc \ table/block_based/block_based_filter_block_test.cc \ @@ -514,6 +546,7 @@ TEST_MAIN_SOURCES = \ util/random_test.cc \ util/rate_limiter_test.cc \ util/repeatable_thread_test.cc \ + util/ribbon_test.cc \ util/slice_test.cc \ util/slice_transform_test.cc \ util/timer_queue_test.cc \ @@ -540,8 +573,9 @@ TEST_MAIN_SOURCES = \ utilities/simulator_cache/sim_cache_test.cc \ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ utilities/transactions/optimistic_transaction_test.cc \ + utilities/transactions/lock/range/range_locking_test.cc \ utilities/transactions/transaction_test.cc \ - utilities/transactions/transaction_lock_mgr_test.cc \ + utilities/transactions/lock/point/point_lock_manager_test.cc \ utilities/transactions/write_prepared_transaction_test.cc \ utilities/transactions/write_unprepared_transaction_test.cc \ utilities/ttl/ttl_test.cc \ @@ -557,6 +591,7 @@ JNI_NATIVE_SOURCES = \ 
java/rocksjni/backupablejni.cc \ java/rocksjni/checkpoint.cc \ java/rocksjni/clock_cache.cc \ + java/rocksjni/cache.cc \ java/rocksjni/columnfamilyhandle.cc \ java/rocksjni/compact_range_options.cc \ java/rocksjni/compaction_filter.cc \ @@ -574,6 +609,8 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/config_options.cc \ java/rocksjni/env.cc \ java/rocksjni/env_options.cc \ + java/rocksjni/event_listener.cc \ + java/rocksjni/event_listener_jnicallback.cc \ java/rocksjni/ingest_external_file_options.cc \ java/rocksjni/filter.cc \ java/rocksjni/iterator.cc \ @@ -620,6 +657,7 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/transaction_notifier.cc \ java/rocksjni/transaction_notifier_jnicallback.cc \ java/rocksjni/ttl.cc \ + java/rocksjni/testable_event_listener.cc \ java/rocksjni/wal_filter.cc \ java/rocksjni/wal_filter_jnicallback.cc \ java/rocksjni/write_batch.cc \ diff --git a/table/adaptive/adaptive_table_factory.cc b/table/adaptive/adaptive_table_factory.cc index 480c4c9a600..63333b1b333 100644 --- a/table/adaptive/adaptive_table_factory.cc +++ b/table/adaptive/adaptive_table_factory.cc @@ -71,10 +71,9 @@ Status AdaptiveTableFactory::NewTableReader( } TableBuilder* AdaptiveTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { - return table_factory_to_write_->NewTableBuilder(table_builder_options, - column_family_id, file); + return table_factory_to_write_->NewTableBuilder(table_builder_options, file); } std::string AdaptiveTableFactory::GetPrintableOptions() const { diff --git a/table/adaptive/adaptive_table_factory.h b/table/adaptive/adaptive_table_factory.h index cbc81868cc6..65f816fad83 100644 --- a/table/adaptive/adaptive_table_factory.h +++ b/table/adaptive/adaptive_table_factory.h @@ -42,7 +42,7 @@ class AdaptiveTableFactory : public TableFactory { TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; std::string GetPrintableOptions() const override; diff --git a/table/block_based/block.cc b/table/block_based/block.cc index bc481d57e53..2d32ebcb492 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -15,7 +15,6 @@ #include #include -#include "logging/logging.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "port/stack_trace.h" diff --git a/table/block_based/block_based_filter_block.cc b/table/block_based/block_based_filter_block.cc index 2e457e32f6e..13b3dcc448a 100644 --- a/table/block_based/block_based_filter_block.cc +++ b/table/block_based/block_based_filter_block.cc @@ -68,7 +68,7 @@ BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( whole_key_filtering_(table_opt.whole_key_filtering), prev_prefix_start_(0), prev_prefix_size_(0), - num_added_(0) { + total_added_in_built_(0) { assert(policy_); } @@ -80,19 +80,22 @@ void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { } } -void BlockBasedFilterBlockBuilder::Add(const Slice& key) { - if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { - AddPrefix(key); +size_t BlockBasedFilterBlockBuilder::EstimateEntriesAdded() { + return total_added_in_built_ + start_.size(); +} + +void BlockBasedFilterBlockBuilder::Add(const Slice& key_without_ts) { + if (prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts)) { + AddPrefix(key_without_ts); } if (whole_key_filtering_) { - AddKey(key); 
+ AddKey(key_without_ts); } } // Add key to filter if needed inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { - num_added_++; start_.push_back(entries_.size()); entries_.append(key.data(), key.size()); } @@ -118,6 +121,7 @@ Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, Status* status) { // In this impl we ignore BlockHandle *status = Status::OK(); + if (!start_.empty()) { GenerateFilter(); } @@ -140,6 +144,7 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { filter_offsets_.push_back(static_cast(result_.size())); return; } + total_added_in_built_ += num_entries; // Make list of keys from flattened key structure start_.push_back(entries_.size()); // Simplify length computation diff --git a/table/block_based/block_based_filter_block.h b/table/block_based/block_based_filter_block.h index 67ded1ee3b4..0b46cd7c1bf 100644 --- a/table/block_based/block_based_filter_block.h +++ b/table/block_based/block_based_filter_block.h @@ -44,8 +44,11 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { virtual bool IsBlockBased() override { return true; } virtual void StartBlock(uint64_t block_offset) override; - virtual void Add(const Slice& key) override; - virtual size_t NumAdded() const override { return num_added_; } + virtual void Add(const Slice& key_without_ts) override; + virtual bool IsEmpty() const override { + return start_.empty() && filter_offsets_.empty(); + } + virtual size_t EstimateEntriesAdded() override; virtual Slice Finish(const BlockHandle& tmp, Status* status) override; using FilterBlockBuilder::Finish; @@ -70,7 +73,7 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { std::string result_; // Filter data computed so far std::vector tmp_entries_; // policy_->CreateFilter() argument std::vector filter_offsets_; - size_t num_added_; // Number of keys added + uint64_t total_added_in_built_; // Total keys added to filters built so far }; // A FilterBlockReader is used to parse filter from SST table. diff --git a/table/block_based/block_based_filter_block_test.cc b/table/block_based/block_based_filter_block_test.cc index 283d6a9a2e4..862e90233e6 100644 --- a/table/block_based/block_based_filter_block_test.cc +++ b/table/block_based/block_based_filter_block_test.cc @@ -76,17 +76,26 @@ TEST_F(FilterBlockTest, EmptyBuilder) { TEST_F(FilterBlockTest, SingleChunk) { BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - ASSERT_EQ(0, builder.NumAdded()); + ASSERT_TRUE(builder.IsEmpty()); builder.StartBlock(100); builder.Add("foo"); + ASSERT_FALSE(builder.IsEmpty()); + builder.Add("bar"); builder.Add("bar"); builder.Add("box"); builder.StartBlock(200); builder.Add("box"); builder.StartBlock(300); builder.Add("hello"); - ASSERT_EQ(5, builder.NumAdded()); - Slice slice(builder.Finish()); + // XXX: "bar" should only count once but is counted twice. This actually + // indicates a serious space usage bug in old block-based filter. Good + // that it is deprecated. + // "box" counts twice, because it's in distinct blocks. 
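// Per the new implementation above, EstimateEntriesAdded() returns
// total_added_in_built_ plus the pending start_.size(), and Add() does not
// de-duplicate keys, so the repeated "bar" within one block and "box"
// across two blocks each count twice, giving the 6 asserted below.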
+ ASSERT_EQ(6, builder.EstimateEntriesAdded()); + ASSERT_FALSE(builder.IsEmpty()); + Status s; + Slice slice = builder.Finish(BlockHandle(), &s); + ASSERT_OK(s); CachableEntry block( new BlockContents(slice), nullptr /* cache */, nullptr /* cache_handle */, diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 28bfbb7b23b..ac0d45cddb9 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -11,25 +11,26 @@ #include #include + #include #include #include #include +#include #include #include #include #include "db/dbformat.h" #include "index_builder.h" -#include "port/lang.h" - +#include "memory/memory_allocator.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/merge_operator.h" #include "rocksdb/table.h" - #include "table/block_based/block.h" #include "table/block_based/block_based_filter_block.h" #include "table/block_based/block_based_table_factory.h" @@ -41,8 +42,6 @@ #include "table/block_based/partitioned_filter_block.h" #include "table/format.h" #include "table/table_builder.h" - -#include "memory/memory_allocator.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" @@ -56,7 +55,6 @@ namespace ROCKSDB_NAMESPACE { extern const std::string kHashIndexPrefixesBlock; extern const std::string kHashIndexPrefixesMetadataBlock; -typedef BlockBasedTableOptions::IndexType IndexType; // Without anonymous namespace here, we fail the warning -Wmissing-prototypes namespace { @@ -68,7 +66,7 @@ FilterBlockBuilder* CreateFilterBlockBuilder( const bool use_delta_encoding_for_index_values, PartitionedIndexBuilder* const p_index_builder) { const BlockBasedTableOptions& table_opt = context.table_options; - if (table_opt.filter_policy == nullptr) return nullptr; + assert(table_opt.filter_policy); // precondition FilterBitsBuilder* filter_bits_builder = BloomFilterPolicy::GetBuilderFromContext(context); @@ -79,8 +77,9 @@ FilterBlockBuilder* CreateFilterBlockBuilder( if (table_opt.partition_filters) { assert(p_index_builder != nullptr); // Since after partition cut request from filter builder it takes time - // until index builder actully cuts the partition, we take the lower bound - // as partition size. + // until the index builder actually cuts the partition, at the end of a + // data block potentially with many keys, we take the lower bound as + // partition size. assert(table_opt.block_size_deviation <= 100); auto partition_size = static_cast(((table_opt.metadata_block_size * @@ -212,9 +211,9 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector return Status::OK(); } - virtual void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) override { + virtual void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Intentionally left blank. No interest in collecting stats for // blocks.
return; @@ -248,24 +247,18 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector }; struct BlockBasedTableBuilder::Rep { - const ImmutableCFOptions ioptions; + const ImmutableOptions ioptions; const MutableCFOptions moptions; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; WritableFileWriter* file; std::atomic offset; - // Synchronize status & io_status accesses across threads from main thread, - // compression thread and write thread in parallel compression. - std::mutex status_mutex; size_t alignment; BlockBuilder data_block; - // Buffers uncompressed data blocks and keys to replay later. Needed when + // Buffers uncompressed data blocks to replay later. Needed when // compression dictionary is enabled so we can finalize the dictionary before // compressing any data blocks. - // TODO(ajkr): ideally we don't buffer all keys and all uncompressed data - // blocks as it's redundant, but it's easier to implement for now. - std::vector>> - data_block_and_keys_buffers; + std::vector data_block_buffers; BlockBuilder range_del_block; InternalKeySliceTransform internal_prefix_transform; @@ -276,6 +269,11 @@ struct BlockBasedTableBuilder::Rep { const Slice* first_key_in_next_block = nullptr; CompressionType compression_type; uint64_t sample_for_compression; + std::atomic compressible_input_data_bytes; + std::atomic uncompressible_input_data_bytes; + std::atomic sampled_input_data_bytes; + std::atomic sampled_output_slow_data_bytes; + std::atomic sampled_output_fast_data_bytes; CompressionOptions compression_opts; std::unique_ptr compression_dict; std::vector> compression_ctxs; @@ -310,9 +308,14 @@ struct BlockBasedTableBuilder::Rep { kClosed, }; State state; + // `kBuffered` state is allowed only as long as the buffering of uncompressed + // data blocks (see `data_block_buffers`) does not exceed `buffer_limit`. + uint64_t buffer_limit; const bool use_delta_encoding_for_index_values; std::unique_ptr filter_builder; + char cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; + size_t cache_key_prefix_size; char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size; @@ -320,17 +323,16 @@ struct BlockBasedTableBuilder::Rep { std::string compressed_output; std::unique_ptr flush_block_policy; - int level_at_creation; uint32_t column_family_id; - const std::string& column_family_name; + std::string column_family_name; uint64_t creation_time = 0; uint64_t oldest_key_time = 0; - const uint64_t target_file_size; uint64_t file_creation_time = 0; // DB IDs const std::string db_id; const std::string db_session_id; + std::string db_host_id; std::vector> table_properties_collectors; @@ -339,78 +341,72 @@ struct BlockBasedTableBuilder::Rep { uint64_t get_offset() { return offset.load(std::memory_order_relaxed); } void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); } - const IOStatus& GetIOStatus() { - if (compression_opts.parallel_threads > 1) { - std::lock_guard lock(status_mutex); - return io_status; + bool IsParallelCompressionEnabled() const { + return compression_opts.parallel_threads > 1; + } + + Status GetStatus() { + // We need to make modifications of status visible when status_ok is set + // to false, and this is ensured by status_mutex, so no special memory + // order for status_ok is required. 
+ if (status_ok.load(std::memory_order_relaxed)) { + return Status::OK(); } else { - return io_status; + return CopyStatus(); } } - const Status& GetStatus() { - if (compression_opts.parallel_threads > 1) { - std::lock_guard lock(status_mutex); - return status; + Status CopyStatus() { + std::lock_guard lock(status_mutex); + return status; + } + + IOStatus GetIOStatus() { + // We need to make modifications of io_status visible when status_ok is set + // to false, and this is ensured by io_status_mutex, so no special memory + // order for io_status_ok is required. + if (io_status_ok.load(std::memory_order_relaxed)) { + return IOStatus::OK(); } else { - return status; + return CopyIOStatus(); } } - void SyncStatusFromIOStatus() { - if (compression_opts.parallel_threads > 1) { - std::lock_guard lock(status_mutex); - if (status.ok()) { - status = io_status; - } - } else if (status.ok()) { - status = io_status; - } + IOStatus CopyIOStatus() { + std::lock_guard lock(io_status_mutex); + return io_status; } // Never erase an existing status that is not OK. void SetStatus(Status s) { - if (!s.ok()) { + if (!s.ok() && status_ok.load(std::memory_order_relaxed)) { // Locking is an overkill for non compression_opts.parallel_threads // case but since it's unlikely that s is not OK, we take this cost // to be simplicity. std::lock_guard lock(status_mutex); - if (status.ok()) { - status = s; - } + status = s; + status_ok.store(false, std::memory_order_relaxed); } } // Never erase an existing I/O status that is not OK. void SetIOStatus(IOStatus ios) { - if (!ios.ok()) { + if (!ios.ok() && io_status_ok.load(std::memory_order_relaxed)) { // Locking is an overkill for non compression_opts.parallel_threads // case but since it's unlikely that s is not OK, we take this cost // to be simplicity. - std::lock_guard lock(status_mutex); - if (io_status.ok()) { - io_status = ios; - } + std::lock_guard lock(io_status_mutex); + io_status = ios; + io_status_ok.store(false, std::memory_order_relaxed); } } - Rep(const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, - const BlockBasedTableOptions& table_opt, - const InternalKeyComparator& icomparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t _column_family_id, WritableFileWriter* f, - const CompressionType _compression_type, - const uint64_t _sample_for_compression, - const CompressionOptions& _compression_opts, const bool skip_filters, - const int _level_at_creation, const std::string& _column_family_name, - const uint64_t _creation_time, const uint64_t _oldest_key_time, - const uint64_t _target_file_size, const uint64_t _file_creation_time, - const std::string& _db_id, const std::string& _db_session_id) - : ioptions(_ioptions), - moptions(_moptions), + Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo, + WritableFileWriter* f) + : ioptions(tbo.ioptions), + moptions(tbo.moptions), table_options(table_opt), - internal_comparator(icomparator), + internal_comparator(tbo.internal_comparator), file(f), offset(0), alignment(table_options.block_align @@ -419,37 +415,52 @@ struct BlockBasedTableBuilder::Rep { data_block(table_options.block_restart_interval, table_options.use_delta_encoding, false /* use_value_delta_encoding */, - icomparator.user_comparator() + tbo.internal_comparator.user_comparator() ->CanKeysWithDifferentByteContentsBeEqual() ? 
BlockBasedTableOptions::kDataBlockBinarySearch : table_options.data_block_index_type, table_options.data_block_hash_table_util_ratio), range_del_block(1 /* block_restart_interval */), - internal_prefix_transform(_moptions.prefix_extractor.get()), - compression_type(_compression_type), - sample_for_compression(_sample_for_compression), - compression_opts(_compression_opts), + internal_prefix_transform(tbo.moptions.prefix_extractor.get()), + compression_type(tbo.compression_type), + sample_for_compression(tbo.moptions.sample_for_compression), + compressible_input_data_bytes(0), + uncompressible_input_data_bytes(0), + sampled_input_data_bytes(0), + sampled_output_slow_data_bytes(0), + sampled_output_fast_data_bytes(0), + compression_opts(tbo.compression_opts), compression_dict(), - compression_ctxs(_compression_opts.parallel_threads), - verify_ctxs(_compression_opts.parallel_threads), + compression_ctxs(tbo.compression_opts.parallel_threads), + verify_ctxs(tbo.compression_opts.parallel_threads), verify_dict(), - state((_compression_opts.max_dict_bytes > 0) ? State::kBuffered - : State::kUnbuffered), + state((tbo.compression_opts.max_dict_bytes > 0) ? State::kBuffered + : State::kUnbuffered), use_delta_encoding_for_index_values(table_opt.format_version >= 4 && !table_opt.block_align), + cache_key_prefix_size(0), compressed_cache_key_prefix_size(0), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)), - level_at_creation(_level_at_creation), - column_family_id(_column_family_id), - column_family_name(_column_family_name), - creation_time(_creation_time), - oldest_key_time(_oldest_key_time), - target_file_size(_target_file_size), - file_creation_time(_file_creation_time), - db_id(_db_id), - db_session_id(_db_session_id) { + column_family_id(tbo.column_family_id), + column_family_name(tbo.column_family_name), + creation_time(tbo.creation_time), + oldest_key_time(tbo.oldest_key_time), + file_creation_time(tbo.file_creation_time), + db_id(tbo.db_id), + db_session_id(tbo.db_session_id), + db_host_id(ioptions.db_host_id), + status_ok(true), + io_status_ok(true) { + if (tbo.target_file_size == 0) { + buffer_limit = compression_opts.max_dict_buffer_bytes; + } else if (compression_opts.max_dict_buffer_bytes == 0) { + buffer_limit = tbo.target_file_size; + } else { + buffer_limit = std::min(tbo.target_file_size, + compression_opts.max_dict_buffer_bytes); + } for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { compression_ctxs[i].reset(new CompressionContext(compression_type)); } @@ -465,39 +476,71 @@ struct BlockBasedTableBuilder::Rep { &this->internal_prefix_transform, use_delta_encoding_for_index_values, table_options)); } - if (skip_filters) { - filter_builder = nullptr; + if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) { + // Apply optimize_filters_for_hits setting here when applicable by + // skipping filter generation + filter_builder.reset(); + } else if (tbo.skip_filters) { + // For SstFileWriter skip_filters + filter_builder.reset(); + } else if (!table_options.filter_policy) { + // Null filter_policy -> no filter + filter_builder.reset(); } else { - FilterBuildingContext context(table_options); - context.column_family_name = column_family_name; - context.compaction_style = ioptions.compaction_style; - context.level_at_creation = level_at_creation; - context.info_log = ioptions.info_log; + FilterBuildingContext filter_context(table_options); + + filter_context.info_log = ioptions.logger; + 
filter_context.column_family_name = tbo.column_family_name; + filter_context.reason = tbo.reason; + + // Only populate other fields if known to be in LSM rather than + // generating external SST file + if (tbo.reason != TableFileCreationReason::kMisc) { + filter_context.compaction_style = ioptions.compaction_style; + filter_context.num_levels = ioptions.num_levels; + filter_context.level_at_creation = tbo.level_at_creation; + filter_context.is_bottommost = tbo.is_bottommost; + assert(filter_context.level_at_creation < filter_context.num_levels); + } + filter_builder.reset(CreateFilterBlockBuilder( - ioptions, moptions, context, use_delta_encoding_for_index_values, - p_index_builder_)); + ioptions, moptions, filter_context, + use_delta_encoding_for_index_values, p_index_builder_)); } - for (auto& collector_factories : *int_tbl_prop_collector_factories) { + const auto& factory_range = tbo.int_tbl_prop_collector_factories; + for (auto it = factory_range.first; it != factory_range.second; ++it) { + assert(*it); + table_properties_collectors.emplace_back( - collector_factories->CreateIntTblPropCollector(column_family_id)); + (*it)->CreateIntTblPropCollector(column_family_id)); } table_properties_collectors.emplace_back( new BlockBasedTablePropertiesCollector( table_options.index_type, table_options.whole_key_filtering, - _moptions.prefix_extractor != nullptr)); + moptions.prefix_extractor != nullptr)); if (table_options.verify_compression) { for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { verify_ctxs[i].reset(new UncompressionContext(compression_type)); } } + + if (!ReifyDbHostIdProperty(ioptions.env, &db_host_id).ok()) { + ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set"); + } } Rep(const Rep&) = delete; Rep& operator=(const Rep&) = delete; private: + // Synchronize status & io_status accesses across threads from main thread, + // compression thread and write thread in parallel compression. + std::mutex status_mutex; + std::atomic status_ok; Status status; + std::mutex io_status_mutex; + std::atomic io_status_ok; IOStatus io_status; }; @@ -593,41 +636,123 @@ struct BlockBasedTableBuilder::ParallelCompressionRep { WriteQueue write_queue; std::unique_ptr write_thread; - // Raw bytes compressed so far. - uint64_t raw_bytes_compressed; - // Size of current block being appended. - uint64_t raw_bytes_curr_block; - // Raw bytes under compression and not appended yet. - std::atomic raw_bytes_inflight; - // Number of blocks under compression and not appended yet. - std::atomic blocks_inflight; - // Current compression ratio, maintained by BGWorkWriteRawBlock. - std::atomic curr_compression_ratio; - // Estimated SST file size. - std::atomic estimated_file_size; - - // Wait for the completion of first block compression to get a - // non-zero compression ratio. - bool first_block; + // Estimate output file size when parallel compression is enabled. This is + // necessary because compression & flush are no longer synchronized, + // and BlockBasedTableBuilder::FileSize() is no longer accurate. + // memory_order_relaxed suffices because accurate statistics is not required. 
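A minimal sketch of the estimate that the FileSizeEstimator below maintains, using made-up numbers; the function name is invented for illustration, and block_trailer_size stands in for kBlockTrailerSize (the per-block compression type byte plus checksum):

#include <cstdint>

// Mirrors what EmitBlock()/ReapBlock() compute: bytes already flushed, plus
// in-flight raw bytes scaled by the running compression ratio, plus one
// trailer per in-flight block.
uint64_t EstimateSstSize(uint64_t curr_file_size, uint64_t raw_bytes_inflight,
                         uint64_t blocks_inflight, double curr_compression_ratio,
                         uint64_t block_trailer_size) {
  return curr_file_size +
         static_cast<uint64_t>(raw_bytes_inflight * curr_compression_ratio) +
         blocks_inflight * block_trailer_size;
}

// For example, with 10 MiB already written, four 256 KiB raw blocks in flight,
// and a running ratio of 0.5, EstimateSstSize(10 << 20, 4 * (256 << 10), 4, 0.5, 5)
// is roughly 10.5 MiB.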
+ class FileSizeEstimator { + public: + explicit FileSizeEstimator() + : raw_bytes_compressed(0), + raw_bytes_curr_block(0), + raw_bytes_curr_block_set(false), + raw_bytes_inflight(0), + blocks_inflight(0), + curr_compression_ratio(0), + estimated_file_size(0) {} + + // Estimate file size when a block is about to be emitted to + // compression thread + void EmitBlock(uint64_t raw_block_size, uint64_t curr_file_size) { + uint64_t new_raw_bytes_inflight = + raw_bytes_inflight.fetch_add(raw_block_size, + std::memory_order_relaxed) + + raw_block_size; + + uint64_t new_blocks_inflight = + blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1; + + estimated_file_size.store( + curr_file_size + + static_cast( + static_cast(new_raw_bytes_inflight) * + curr_compression_ratio.load(std::memory_order_relaxed)) + + new_blocks_inflight * kBlockTrailerSize, + std::memory_order_relaxed); + } + + // Estimate file size when a block is already reaped from + // compression thread + void ReapBlock(uint64_t compressed_block_size, uint64_t curr_file_size) { + assert(raw_bytes_curr_block_set); + + uint64_t new_raw_bytes_compressed = + raw_bytes_compressed + raw_bytes_curr_block; + assert(new_raw_bytes_compressed > 0); + + curr_compression_ratio.store( + (curr_compression_ratio.load(std::memory_order_relaxed) * + raw_bytes_compressed + + compressed_block_size) / + static_cast(new_raw_bytes_compressed), + std::memory_order_relaxed); + raw_bytes_compressed = new_raw_bytes_compressed; + + uint64_t new_raw_bytes_inflight = + raw_bytes_inflight.fetch_sub(raw_bytes_curr_block, + std::memory_order_relaxed) - + raw_bytes_curr_block; + + uint64_t new_blocks_inflight = + blocks_inflight.fetch_sub(1, std::memory_order_relaxed) - 1; + + estimated_file_size.store( + curr_file_size + + static_cast( + static_cast(new_raw_bytes_inflight) * + curr_compression_ratio.load(std::memory_order_relaxed)) + + new_blocks_inflight * kBlockTrailerSize, + std::memory_order_relaxed); + + raw_bytes_curr_block_set = false; + } + + void SetEstimatedFileSize(uint64_t size) { + estimated_file_size.store(size, std::memory_order_relaxed); + } + + uint64_t GetEstimatedFileSize() { + return estimated_file_size.load(std::memory_order_relaxed); + } + + void SetCurrBlockRawSize(uint64_t size) { + raw_bytes_curr_block = size; + raw_bytes_curr_block_set = true; + } + + private: + // Raw bytes compressed so far. + uint64_t raw_bytes_compressed; + // Size of current block being appended. + uint64_t raw_bytes_curr_block; + // Whether raw_bytes_curr_block has been set for next + // ReapBlock call. + bool raw_bytes_curr_block_set; + // Raw bytes under compression and not appended yet. + std::atomic raw_bytes_inflight; + // Number of blocks under compression and not appended yet. + std::atomic blocks_inflight; + // Current compression ratio, maintained by BGWorkWriteRawBlock. + std::atomic curr_compression_ratio; + // Estimated SST file size. + std::atomic estimated_file_size; + }; + FileSizeEstimator file_size_estimator; + + // Facilities used for waiting first block completion. Need to Wait for + // the completion of first block compression and flush to get a non-zero + // compression ratio. 
+ std::atomic first_block_processed; std::condition_variable first_block_cond; std::mutex first_block_mutex; - bool finished; - - ParallelCompressionRep(uint32_t parallel_threads) + explicit ParallelCompressionRep(uint32_t parallel_threads) : curr_block_keys(new Keys()), block_rep_buf(parallel_threads), block_rep_pool(parallel_threads), compress_queue(parallel_threads), write_queue(parallel_threads), - raw_bytes_compressed(0), - raw_bytes_curr_block(0), - raw_bytes_inflight(0), - blocks_inflight(0), - curr_compression_ratio(0), - estimated_file_size(0), - first_block(true), - finished(false) { + first_block_processed(false) { for (uint32_t i = 0; i < parallel_threads; i++) { block_rep_buf[i].contents = Slice(); block_rep_buf[i].compressed_contents = Slice(); @@ -643,27 +768,98 @@ struct BlockBasedTableBuilder::ParallelCompressionRep { } ~ParallelCompressionRep() { block_rep_pool.finish(); } + + // Make a block prepared to be emitted to compression thread + // Used in non-buffered mode + BlockRep* PrepareBlock(CompressionType compression_type, + const Slice* first_key_in_next_block, + BlockBuilder* data_block) { + BlockRep* block_rep = + PrepareBlockInternal(compression_type, first_key_in_next_block); + assert(block_rep != nullptr); + data_block->SwapAndReset(*(block_rep->data)); + block_rep->contents = *(block_rep->data); + std::swap(block_rep->keys, curr_block_keys); + curr_block_keys->Clear(); + return block_rep; + } + + // Used in EnterUnbuffered + BlockRep* PrepareBlock(CompressionType compression_type, + const Slice* first_key_in_next_block, + std::string* data_block, + std::vector* keys) { + BlockRep* block_rep = + PrepareBlockInternal(compression_type, first_key_in_next_block); + assert(block_rep != nullptr); + std::swap(*(block_rep->data), *data_block); + block_rep->contents = *(block_rep->data); + block_rep->keys->SwapAssign(*keys); + return block_rep; + } + + // Emit a block to compression thread + void EmitBlock(BlockRep* block_rep) { + assert(block_rep != nullptr); + assert(block_rep->status.ok()); + if (!write_queue.push(block_rep->slot.get())) { + return; + } + if (!compress_queue.push(block_rep)) { + return; + } + + if (!first_block_processed.load(std::memory_order_relaxed)) { + std::unique_lock lock(first_block_mutex); + first_block_cond.wait(lock, [this] { + return first_block_processed.load(std::memory_order_relaxed); + }); + } + } + + // Reap a block from compression thread + void ReapBlock(BlockRep* block_rep) { + assert(block_rep != nullptr); + block_rep->compressed_data->clear(); + block_rep_pool.push(block_rep); + + if (!first_block_processed.load(std::memory_order_relaxed)) { + std::lock_guard lock(first_block_mutex); + first_block_processed.store(true, std::memory_order_relaxed); + first_block_cond.notify_one(); + } + } + + private: + BlockRep* PrepareBlockInternal(CompressionType compression_type, + const Slice* first_key_in_next_block) { + BlockRep* block_rep = nullptr; + block_rep_pool.pop(block_rep); + assert(block_rep != nullptr); + + assert(block_rep->data); + + block_rep->compression_type = compression_type; + + if (first_key_in_next_block == nullptr) { + block_rep->first_key_in_next_block.reset(nullptr); + } else { + block_rep->first_key_in_next_block->assign( + first_key_in_next_block->data(), first_key_in_next_block->size()); + } + + return block_rep; + } }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& 
internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, WritableFileWriter* file, - const CompressionType compression_type, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, const bool skip_filters, - const std::string& column_family_name, const int level_at_creation, - const uint64_t creation_time, const uint64_t oldest_key_time, - const uint64_t target_file_size, const uint64_t file_creation_time, - const std::string& db_id, const std::string& db_session_id) { + const BlockBasedTableOptions& table_options, const TableBuilderOptions& tbo, + WritableFileWriter* file) { BlockBasedTableOptions sanitized_table_options(table_options); if (sanitized_table_options.format_version == 0 && sanitized_table_options.checksum != kCRC32c) { ROCKS_LOG_WARN( - ioptions.info_log, + tbo.ioptions.logger, "Silently converting format_version to 1 because checksum is " "non-default"); // silently convert format_version to 1 to keep consistent with current @@ -671,36 +867,16 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( sanitized_table_options.format_version = 1; } - rep_ = new Rep( - ioptions, moptions, sanitized_table_options, internal_comparator, - int_tbl_prop_collector_factories, column_family_id, file, - compression_type, sample_for_compression, compression_opts, skip_filters, - level_at_creation, column_family_name, creation_time, oldest_key_time, - target_file_size, file_creation_time, db_id, db_session_id); + rep_ = new Rep(sanitized_table_options, tbo, file); if (rep_->filter_builder != nullptr) { rep_->filter_builder->StartBlock(0); } - if (table_options.block_cache_compressed.get() != nullptr) { - BlockBasedTable::GenerateCachePrefix( - table_options.block_cache_compressed.get(), file->writable_file(), - &rep_->compressed_cache_key_prefix[0], - &rep_->compressed_cache_key_prefix_size); - } - - if (rep_->compression_opts.parallel_threads > 1) { - rep_->pc_rep.reset( - new ParallelCompressionRep(rep_->compression_opts.parallel_threads)); - rep_->pc_rep->compress_thread_pool.reserve( - rep_->compression_opts.parallel_threads); - for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) { - rep_->pc_rep->compress_thread_pool.emplace_back([this, i] { - BGWorkCompression(*(rep_->compression_ctxs[i]), - rep_->verify_ctxs[i].get()); - }); - } - rep_->pc_rep->write_thread.reset( - new port::Thread([this] { BGWorkWriteRawBlock(); })); + + SetupCacheKeyPrefix(tbo); + + if (rep_->IsParallelCompressionEnabled()) { + StartParallelCompression(); } } @@ -728,8 +904,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->first_key_in_next_block = &key; Flush(); - if (r->state == Rep::State::kBuffered && r->target_file_size != 0 && - r->data_begin_offset > r->target_file_size) { + if (r->state == Rep::State::kBuffered && r->buffer_limit != 0 && + r->data_begin_offset > r->buffer_limit) { EnterUnbuffered(); } @@ -742,7 +918,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { // entries in the first block and < all entries in subsequent // blocks. 
if (ok() && r->state == Rep::State::kUnbuffered) { - if (r->compression_opts.parallel_threads > 1) { + if (r->IsParallelCompressionEnabled()) { r->pc_rep->curr_block_keys->Clear(); } else { r->index_builder->AddIndexEntry(&r->last_key, &key, @@ -754,7 +930,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { // Note: PartitionedFilterBlockBuilder requires key being added to filter // builder after being added to index builder. if (r->state == Rep::State::kUnbuffered) { - if (r->compression_opts.parallel_threads > 1) { + if (r->IsParallelCompressionEnabled()) { r->pc_rep->curr_block_keys->PushBack(key); } else { if (r->filter_builder != nullptr) { @@ -768,28 +944,24 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->last_key.assign(key.data(), key.size()); r->data_block.Add(key, value); if (r->state == Rep::State::kBuffered) { - // Buffer keys to be replayed during `Finish()` once compression - // dictionary has been finalized. - if (r->data_block_and_keys_buffers.empty() || should_flush) { - r->data_block_and_keys_buffers.emplace_back(); - } - r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString()); + // Buffered keys will be replayed from data_block_buffers during + // `Finish()` once compression dictionary has been finalized. } else { - if (r->compression_opts.parallel_threads == 1) { + if (!r->IsParallelCompressionEnabled()) { r->index_builder->OnKeyAdded(key); } } // TODO offset passed in is not accurate for parallel compression case NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), r->table_properties_collectors, - r->ioptions.info_log); + r->ioptions.logger); } else if (value_type == kTypeRangeDeletion) { r->range_del_block.Add(key, value); // TODO offset passed in is not accurate for parallel compression case NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), r->table_properties_collectors, - r->ioptions.info_log); + r->ioptions.logger); } else { assert(false); } @@ -812,61 +984,15 @@ void BlockBasedTableBuilder::Flush() { assert(rep_->state != Rep::State::kClosed); if (!ok()) return; if (r->data_block.empty()) return; - if (r->compression_opts.parallel_threads > 1 && + if (r->IsParallelCompressionEnabled() && r->state == Rep::State::kUnbuffered) { - ParallelCompressionRep::BlockRep* block_rep = nullptr; - r->pc_rep->block_rep_pool.pop(block_rep); - assert(block_rep != nullptr); - r->data_block.Finish(); - assert(block_rep->data); - r->data_block.SwapAndReset(*(block_rep->data)); - - block_rep->contents = *(block_rep->data); - - block_rep->compression_type = r->compression_type; - - std::swap(block_rep->keys, r->pc_rep->curr_block_keys); - r->pc_rep->curr_block_keys->Clear(); - - if (r->first_key_in_next_block == nullptr) { - block_rep->first_key_in_next_block.reset(nullptr); - } else { - block_rep->first_key_in_next_block->assign( - r->first_key_in_next_block->data(), - r->first_key_in_next_block->size()); - } - - uint64_t new_raw_bytes_inflight = - r->pc_rep->raw_bytes_inflight.fetch_add(block_rep->data->size(), - std::memory_order_relaxed) + - block_rep->data->size(); - uint64_t new_blocks_inflight = - r->pc_rep->blocks_inflight.fetch_add(1, std::memory_order_relaxed) + 1; - r->pc_rep->estimated_file_size.store( - r->get_offset() + - static_cast(static_cast(new_raw_bytes_inflight) * - r->pc_rep->curr_compression_ratio.load( - std::memory_order_relaxed)) + - new_blocks_inflight * kBlockTrailerSize, - std::memory_order_relaxed); - - // Read out first_block here to avoid data race with 
BGWorkWriteRawBlock - bool first_block = r->pc_rep->first_block; - - assert(block_rep->status.ok()); - if (!r->pc_rep->write_queue.push(block_rep->slot.get())) { - return; - } - if (!r->pc_rep->compress_queue.push(block_rep)) { - return; - } - - if (first_block) { - std::unique_lock lock(r->pc_rep->first_block_mutex); - r->pc_rep->first_block_cond.wait(lock, - [r] { return !r->pc_rep->first_block; }); - } + ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( + r->compression_type, r->first_key_in_next_block, &(r->data_block)); + assert(block_rep != nullptr); + r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), + r->get_offset()); + r->pc_rep->EmitBlock(block_rep); } else { WriteBlock(&r->data_block, &r->pending_handle, true /* is_data_block */); } @@ -875,23 +1001,25 @@ void BlockBasedTableBuilder::Flush() { void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block) { - WriteBlock(block->Finish(), handle, is_data_block); - block->Reset(); + block->Finish(); + std::string raw_block_contents; + block->SwapAndReset(raw_block_contents); + if (rep_->state == Rep::State::kBuffered) { + assert(is_data_block); + rep_->data_block_buffers.emplace_back(std::move(raw_block_contents)); + rep_->data_begin_offset += rep_->data_block_buffers.back().size(); + return; + } + WriteBlock(raw_block_contents, handle, is_data_block); } void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, BlockHandle* handle, bool is_data_block) { Rep* r = rep_; + assert(r->state == Rep::State::kUnbuffered); Slice block_contents; CompressionType type; - if (r->state == Rep::State::kBuffered) { - assert(is_data_block); - assert(!r->data_block_and_keys_buffers.empty()); - r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString(); - r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size(); - return; - } Status compress_status; CompressAndVerifyBlock(raw_block_contents, is_data_block, *(r->compression_ctxs[0]), r->verify_ctxs[0].get(), @@ -901,7 +1029,9 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, if (!ok()) { return; } - WriteRawBlock(block_contents, type, handle, is_data_block); + + WriteRawBlock(block_contents, type, handle, is_data_block, + &raw_block_contents); r->compressed_output.clear(); if (is_data_block) { if (r->filter_builder != nullptr) { @@ -913,9 +1043,11 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, } void BlockBasedTableBuilder::BGWorkCompression( - CompressionContext& compression_ctx, UncompressionContext* verify_ctx) { - ParallelCompressionRep::BlockRep* block_rep; + const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx) { + ParallelCompressionRep::BlockRep* block_rep = nullptr; while (rep_->pc_rep->compress_queue.pop(block_rep)) { + assert(block_rep != nullptr); CompressAndVerifyBlock(block_rep->contents, true, /* is_data_block*/ compression_ctx, verify_ctx, block_rep->compressed_data.get(), @@ -927,25 +1059,32 @@ void BlockBasedTableBuilder::BGWorkCompression( void BlockBasedTableBuilder::CompressAndVerifyBlock( const Slice& raw_block_contents, bool is_data_block, - CompressionContext& compression_ctx, UncompressionContext* verify_ctx_ptr, + const CompressionContext& compression_ctx, UncompressionContext* verify_ctx, std::string* compressed_output, Slice* block_contents, CompressionType* type, Status* out_status) { // File format contains a sequence of blocks where each block has: // block_data: 
uint8[n] // type: uint8 // crc: uint32 - assert(ok()); Rep* r = rep_; + bool is_status_ok = ok(); + if (!r->IsParallelCompressionEnabled()) { + assert(is_status_ok); + } *type = r->compression_type; uint64_t sample_for_compression = r->sample_for_compression; bool abort_compression = false; StopWatchNano timer( - r->ioptions.env, - ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)); + r->ioptions.clock, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)); - if (raw_block_contents.size() < kCompressionSizeLimit) { + if (is_status_ok && raw_block_contents.size() < kCompressionSizeLimit) { + if (is_data_block) { + r->compressible_input_data_bytes.fetch_add(raw_block_contents.size(), + std::memory_order_relaxed); + } const CompressionDict* compression_dict; if (!is_data_block || r->compression_dict == nullptr) { compression_dict = &CompressionDict::GetEmptyDict(); @@ -964,6 +1103,16 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( r->table_options.format_version, is_data_block /* do_sample */, compressed_output, &sampled_output_fast, &sampled_output_slow); + if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) { + // Currently compression sampling is only enabled for data block. + assert(is_data_block); + r->sampled_input_data_bytes.fetch_add(raw_block_contents.size(), + std::memory_order_relaxed); + r->sampled_output_slow_data_bytes.fetch_add(sampled_output_slow.size(), + std::memory_order_relaxed); + r->sampled_output_fast_data_bytes.fetch_add(sampled_output_fast.size(), + std::memory_order_relaxed); + } // notify collectors on block add NotifyCollectTableCollectorsOnBlockAdd( r->table_properties_collectors, raw_block_contents.size(), @@ -982,7 +1131,7 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( } assert(verify_dict != nullptr); BlockContents contents; - UncompressionInfo uncompression_info(*verify_ctx_ptr, *verify_dict, + UncompressionInfo uncompression_info(*verify_ctx, *verify_dict, r->compression_type); Status stat = UncompressBlockContentsForCompressionType( uncompression_info, block_contents->data(), block_contents->size(), @@ -993,7 +1142,7 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( if (!compressed_ok) { // The result of the compression was invalid. abort. abort_compression = true; - ROCKS_LOG_ERROR(r->ioptions.info_log, + ROCKS_LOG_ERROR(r->ioptions.logger, "Decompressed block did not match raw block"); *out_status = Status::Corruption("Decompressed block did not match raw block"); @@ -1007,36 +1156,45 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( } } else { // Block is too big to be compressed. + if (is_data_block) { + r->uncompressible_input_data_bytes.fetch_add(raw_block_contents.size(), + std::memory_order_relaxed); + } abort_compression = true; } + if (is_data_block) { + r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize, + std::memory_order_relaxed); + } // Abort compression if the block is too big, or did not pass // verification. 
if (abort_compression) { - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); + RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED); *type = kNoCompression; *block_contents = raw_block_contents; } else if (*type != kNoCompression) { - if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)) { - RecordTimeToHistogram(r->ioptions.statistics, COMPRESSION_TIMES_NANOS, + if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)) { + RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS, timer.ElapsedNanos()); } - RecordInHistogram(r->ioptions.statistics, BYTES_COMPRESSED, + RecordInHistogram(r->ioptions.stats, BYTES_COMPRESSED, raw_block_contents.size()); - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_COMPRESSED); + RecordTick(r->ioptions.stats, NUMBER_BLOCK_COMPRESSED); } else if (*type != r->compression_type) { - RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); + RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED); } } void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, CompressionType type, BlockHandle* handle, - bool is_data_block) { + bool is_data_block, + const Slice* raw_block_contents) { Rep* r = rep_; Status s = Status::OK(); IOStatus io_s = IOStatus::OK(); - StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); + StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS); handle->set_offset(r->get_offset()); handle->set_size(block_contents.size()); assert(status().ok()); @@ -1089,7 +1247,21 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, io_s = r->file->Append(Slice(trailer, kBlockTrailerSize)); if (io_s.ok()) { assert(s.ok()); - s = InsertBlockInCache(block_contents, type, handle); + if (is_data_block && + r->table_options.prepopulate_block_cache == + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly) { + if (type == kNoCompression) { + s = InsertBlockInCache(block_contents, handle); + } else if (raw_block_contents != nullptr) { + s = InsertBlockInCache(*raw_block_contents, handle); + } + if (!s.ok()) { + r->SetStatus(s); + } + } + // TODO:: Should InsertBlockInCompressedCache take into account error from + // InsertBlockInCache or ignore and overwrite it. 
+ s = InsertBlockInCompressedCache(block_contents, type, handle); if (!s.ok()) { r->SetStatus(s); } @@ -1111,39 +1283,12 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, r->SetIOStatus(io_s); } } - if (r->compression_opts.parallel_threads > 1) { - if (!r->pc_rep->finished) { - assert(r->pc_rep->raw_bytes_compressed + - r->pc_rep->raw_bytes_curr_block > - 0); - r->pc_rep->curr_compression_ratio.store( - (r->pc_rep->curr_compression_ratio.load( - std::memory_order_relaxed) * - r->pc_rep->raw_bytes_compressed + - block_contents.size()) / - static_cast(r->pc_rep->raw_bytes_compressed + - r->pc_rep->raw_bytes_curr_block), - std::memory_order_relaxed); - r->pc_rep->raw_bytes_compressed += r->pc_rep->raw_bytes_curr_block; - uint64_t new_raw_bytes_inflight = - r->pc_rep->raw_bytes_inflight.fetch_sub( - r->pc_rep->raw_bytes_curr_block, std::memory_order_relaxed) - - r->pc_rep->raw_bytes_curr_block; - uint64_t new_blocks_inflight = r->pc_rep->blocks_inflight.fetch_sub( - 1, std::memory_order_relaxed) - - 1; - assert(new_blocks_inflight < r->compression_opts.parallel_threads); - r->pc_rep->estimated_file_size.store( - r->get_offset() + - static_cast( - static_cast(new_raw_bytes_inflight) * - r->pc_rep->curr_compression_ratio.load( - std::memory_order_relaxed)) + - new_blocks_inflight * kBlockTrailerSize, - std::memory_order_relaxed); + if (r->IsParallelCompressionEnabled()) { + if (is_data_block) { + r->pc_rep->file_size_estimator.ReapBlock(block_contents.size(), + r->get_offset()); } else { - r->pc_rep->estimated_file_size.store(r->get_offset(), - std::memory_order_relaxed); + r->pc_rep->file_size_estimator.SetEstimatedFileSize(r->get_offset()); } } } @@ -1157,24 +1302,19 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, void BlockBasedTableBuilder::BGWorkWriteRawBlock() { Rep* r = rep_; - ParallelCompressionRep::BlockRepSlot* slot; - ParallelCompressionRep::BlockRep* block_rep; + ParallelCompressionRep::BlockRepSlot* slot = nullptr; + ParallelCompressionRep::BlockRep* block_rep = nullptr; while (r->pc_rep->write_queue.pop(slot)) { + assert(slot != nullptr); slot->Take(block_rep); + assert(block_rep != nullptr); if (!block_rep->status.ok()) { r->SetStatus(block_rep->status); - // Return block_rep to the pool so that blocked Flush() can finish + // Reap block so that blocked Flush() can finish // if there is one, and Flush() will notice !ok() next time. block_rep->status = Status::OK(); - block_rep->compressed_data->clear(); - r->pc_rep->block_rep_pool.push(block_rep); - // Unlock first block if necessary. 
- if (r->pc_rep->first_block) { - std::lock_guard lock(r->pc_rep->first_block_mutex); - r->pc_rep->first_block = false; - r->pc_rep->first_block_cond.notify_one(); - } - break; + r->pc_rep->ReapBlock(block_rep); + continue; } for (size_t i = 0; i < block_rep->keys->Size(); i++) { @@ -1187,19 +1327,15 @@ void BlockBasedTableBuilder::BGWorkWriteRawBlock() { r->index_builder->OnKeyAdded(key); } - r->pc_rep->raw_bytes_curr_block = block_rep->data->size(); + r->pc_rep->file_size_estimator.SetCurrBlockRawSize(block_rep->data->size()); + WriteRawBlock(block_rep->compressed_contents, block_rep->compression_type, - &r->pending_handle, true /* is_data_block*/); + &r->pending_handle, true /* is_data_block*/, + &block_rep->contents); if (!ok()) { break; } - if (r->pc_rep->first_block) { - std::lock_guard lock(r->pc_rep->first_block_mutex); - r->pc_rep->first_block = false; - r->pc_rep->first_block_cond.notify_one(); - } - if (r->filter_builder != nullptr) { r->filter_builder->StartBlock(r->get_offset()); } @@ -1216,31 +1352,77 @@ void BlockBasedTableBuilder::BGWorkWriteRawBlock() { &first_key_in_next_block, r->pending_handle); } - block_rep->compressed_data->clear(); - r->pc_rep->block_rep_pool.push(block_rep); + + r->pc_rep->ReapBlock(block_rep); } } +void BlockBasedTableBuilder::StartParallelCompression() { + rep_->pc_rep.reset( + new ParallelCompressionRep(rep_->compression_opts.parallel_threads)); + rep_->pc_rep->compress_thread_pool.reserve( + rep_->compression_opts.parallel_threads); + for (uint32_t i = 0; i < rep_->compression_opts.parallel_threads; i++) { + rep_->pc_rep->compress_thread_pool.emplace_back([this, i] { + BGWorkCompression(*(rep_->compression_ctxs[i]), + rep_->verify_ctxs[i].get()); + }); + } + rep_->pc_rep->write_thread.reset( + new port::Thread([this] { BGWorkWriteRawBlock(); })); +} + +void BlockBasedTableBuilder::StopParallelCompression() { + rep_->pc_rep->compress_queue.finish(); + for (auto& thread : rep_->pc_rep->compress_thread_pool) { + thread.join(); + } + rep_->pc_rep->write_queue.finish(); + rep_->pc_rep->write_thread->join(); +} + Status BlockBasedTableBuilder::status() const { return rep_->GetStatus(); } IOStatus BlockBasedTableBuilder::io_status() const { return rep_->GetIOStatus(); } -static void DeleteCachedBlockContents(const Slice& /*key*/, void* value) { - BlockContents* bc = reinterpret_cast(value); - delete bc; +namespace { +// Delete the entry resided in the cache. +template +void DeleteEntryCached(const Slice& /*key*/, void* value) { + auto entry = reinterpret_cast(value); + delete entry; +} +} // namespace + +// Helper function to setup the cache key's prefix for the Table. 
+void BlockBasedTableBuilder::SetupCacheKeyPrefix( + const TableBuilderOptions& tbo) { + if (rep_->table_options.block_cache.get() != nullptr) { + BlockBasedTable::GenerateCachePrefix( + rep_->table_options.block_cache.get(), rep_->file->writable_file(), + &rep_->cache_key_prefix[0], &rep_->cache_key_prefix_size, + tbo.db_session_id, tbo.cur_file_num); + } + if (rep_->table_options.block_cache_compressed.get() != nullptr) { + BlockBasedTable::GenerateCachePrefix( + rep_->table_options.block_cache_compressed.get(), + rep_->file->writable_file(), &rep_->compressed_cache_key_prefix[0], + &rep_->compressed_cache_key_prefix_size, tbo.db_session_id, + tbo.cur_file_num); + } } // // Make a copy of the block contents and insert into compressed block cache // -Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, - const CompressionType type, - const BlockHandle* handle) { +Status BlockBasedTableBuilder::InsertBlockInCompressedCache( + const Slice& block_contents, const CompressionType type, + const BlockHandle* handle) { Rep* r = rep_; Cache* block_cache_compressed = r->table_options.block_cache_compressed.get(); - + Status s; if (type != kNoCompression && block_cache_compressed != nullptr) { size_t size = block_contents.size(); @@ -1262,27 +1444,63 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, Slice key(r->compressed_cache_key_prefix, static_cast(end - r->compressed_cache_key_prefix)); - // Insert into compressed block cache. - // How should we deal with compressed cache full? - block_cache_compressed - ->Insert(key, block_contents_to_cache, - block_contents_to_cache->ApproximateMemoryUsage(), - &DeleteCachedBlockContents) - .PermitUncheckedError(); - + s = block_cache_compressed->Insert( + key, block_contents_to_cache, + block_contents_to_cache->ApproximateMemoryUsage(), + &DeleteEntryCached); + if (s.ok()) { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD); + } else { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + } // Invalidate OS cache. 
r->file->InvalidateCache(static_cast(r->get_offset()), size) .PermitUncheckedError(); } - return Status::OK(); + return s; +} + +Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, + const BlockHandle* handle) { + // Uncompressed regular block cache + Cache* block_cache = rep_->table_options.block_cache.get(); + Status s; + if (block_cache != nullptr) { + size_t size = block_contents.size(); + auto buf = AllocateBlock(size, block_cache->memory_allocator()); + memcpy(buf.get(), block_contents.data(), size); + BlockContents results(std::move(buf), size); + + char + cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice key = BlockBasedTable::GetCacheKey(rep_->cache_key_prefix, + rep_->cache_key_prefix_size, + *handle, cache_key); + + const size_t read_amp_bytes_per_bit = + rep_->table_options.read_amp_bytes_per_bit; + Block* block = new Block(std::move(results), read_amp_bytes_per_bit); + size_t charge = block->ApproximateMemoryUsage(); + s = block_cache->Insert(key, block, charge, &DeleteEntryCached); + if (s.ok()) { + BlockBasedTable::UpdateCacheInsertionMetrics( + BlockType::kData, nullptr /*get_context*/, charge, + s.IsOkOverwritten(), rep_->ioptions.stats); + } else { + RecordTick(rep_->ioptions.stats, BLOCK_CACHE_ADD_FAILURES); + } + } + return s; } void BlockBasedTableBuilder::WriteFilterBlock( MetaIndexBuilder* meta_index_builder) { BlockHandle filter_block_handle; - bool empty_filter_block = (rep_->filter_builder == nullptr || - rep_->filter_builder->NumAdded() == 0); + bool empty_filter_block = + (rep_->filter_builder == nullptr || rep_->filter_builder->IsEmpty()); if (ok() && !empty_filter_block) { + rep_->props.num_filter_entries += + rep_->filter_builder->EstimateEntriesAdded(); Status s = Status::Incomplete(); while (ok() && s.IsIncomplete()) { Slice filter_content = @@ -1339,20 +1557,23 @@ void BlockBasedTableBuilder::WriteIndexBlock( } } // If there are more index partitions, finish them and write them out - Status s = index_builder_status; - while (ok() && s.IsIncomplete()) { - s = rep_->index_builder->Finish(&index_blocks, *index_block_handle); - if (!s.ok() && !s.IsIncomplete()) { - rep_->SetStatus(s); - return; - } - if (rep_->table_options.enable_index_compression) { - WriteBlock(index_blocks.index_block_contents, index_block_handle, false); - } else { - WriteRawBlock(index_blocks.index_block_contents, kNoCompression, - index_block_handle); + if (index_builder_status.IsIncomplete()) { + Status s = Status::Incomplete(); + while (ok() && s.IsIncomplete()) { + s = rep_->index_builder->Finish(&index_blocks, *index_block_handle); + if (!s.ok() && !s.IsIncomplete()) { + rep_->SetStatus(s); + return; + } + if (rep_->table_options.enable_index_compression) { + WriteBlock(index_blocks.index_block_contents, index_block_handle, + false); + } else { + WriteRawBlock(index_blocks.index_block_contents, kNoCompression, + index_block_handle); + } + // The last index_block_handle will be for the partition index block } - // The last index_block_handle will be for the partition index block } } @@ -1410,15 +1631,37 @@ void BlockBasedTableBuilder::WritePropertiesBlock( rep_->props.creation_time = rep_->creation_time; rep_->props.oldest_key_time = rep_->oldest_key_time; rep_->props.file_creation_time = rep_->file_creation_time; + if (rep_->sampled_input_data_bytes > 0) { + rep_->props.slow_compression_estimated_data_size = static_cast( + static_cast(rep_->sampled_output_slow_data_bytes) / + rep_->sampled_input_data_bytes * + 
rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes + 0.5); + rep_->props.fast_compression_estimated_data_size = static_cast( + static_cast(rep_->sampled_output_fast_data_bytes) / + rep_->sampled_input_data_bytes * + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes + 0.5); + } else if (rep_->sample_for_compression > 0) { + // We tried to sample but none were found. Assume worst-case (compression + // ratio 1.0) so data is complete and aggregatable. + rep_->props.slow_compression_estimated_data_size = + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes; + rep_->props.fast_compression_estimated_data_size = + rep_->compressible_input_data_bytes + + rep_->uncompressible_input_data_bytes; + } rep_->props.db_id = rep_->db_id; rep_->props.db_session_id = rep_->db_session_id; + rep_->props.db_host_id = rep_->db_host_id; // Add basic properties property_block_builder.AddTableProperty(rep_->props); // Add use collected properties NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors, - rep_->ioptions.info_log, + rep_->ioptions.logger, &property_block_builder); WriteRawBlock(property_block_builder.Finish(), kNoCompression, @@ -1497,11 +1740,12 @@ void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, footer.EncodeTo(&footer_encoding); assert(ok()); IOStatus ios = r->file->Append(footer_encoding); - r->SetIOStatus(ios); if (ios.ok()) { r->set_offset(r->get_offset() + footer_encoding.size()); + } else { + r->SetIOStatus(ios); + r->SetStatus(ios); } - r->SyncStatusFromIOStatus(); } void BlockBasedTableBuilder::EnterUnbuffered() { @@ -1511,20 +1755,45 @@ void BlockBasedTableBuilder::EnterUnbuffered() { const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0 ? r->compression_opts.zstd_max_train_bytes : r->compression_opts.max_dict_bytes; - Random64 generator{r->creation_time}; + const size_t kNumBlocksBuffered = r->data_block_buffers.size(); + if (kNumBlocksBuffered == 0) { + // The below code is neither safe nor necessary for handling zero data + // blocks. + return; + } + + // Abstract algebra teaches us that a finite cyclic group (such as the + // additive group of integers modulo N) can be generated by a number that is + // coprime with N. Since N is variable (number of buffered data blocks), we + // must then pick a prime number in order to guarantee coprimeness with any N. + // + // One downside of this approach is the spread will be poor when + // `kPrimeGeneratorRemainder` is close to zero or close to + // `kNumBlocksBuffered`. + // + // Picked a random number between one and one trillion and then chose the + // next prime number greater than or equal to it. + const uint64_t kPrimeGenerator = 545055921143ull; + // Can avoid repeated division by just adding the remainder repeatedly. 
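A worked example of the walk performed below, assuming ten buffered data blocks (the numbers are illustrative only):
// kPrimeGeneratorRemainder = 545055921143 % 10 = 3
// kInitSampleIdx           = 10 / 2           = 5
// Visiting order of buffer_idx: 5, 8, 1, 4, 7, 0, 3, 6, 9, 2
// Because gcd(3, 10) == 1, each buffered block is visited exactly once; the
// loop stops earlier if kSampleBytes of sample data has already been collected.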
+ const size_t kPrimeGeneratorRemainder = static_cast( + kPrimeGenerator % static_cast(kNumBlocksBuffered)); + const size_t kInitSampleIdx = kNumBlocksBuffered / 2; + std::string compression_dict_samples; std::vector compression_dict_sample_lens; - if (!r->data_block_and_keys_buffers.empty()) { - while (compression_dict_samples.size() < kSampleBytes) { - size_t rand_idx = - static_cast( - generator.Uniform(r->data_block_and_keys_buffers.size())); - size_t copy_len = - std::min(kSampleBytes - compression_dict_samples.size(), - r->data_block_and_keys_buffers[rand_idx].first.size()); - compression_dict_samples.append( - r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len); - compression_dict_sample_lens.emplace_back(copy_len); + size_t buffer_idx = kInitSampleIdx; + for (size_t i = 0; + i < kNumBlocksBuffered && compression_dict_samples.size() < kSampleBytes; + ++i) { + size_t copy_len = std::min(kSampleBytes - compression_dict_samples.size(), + r->data_block_buffers[buffer_idx].size()); + compression_dict_samples.append(r->data_block_buffers[buffer_idx], 0, + copy_len); + compression_dict_sample_lens.emplace_back(copy_len); + + buffer_idx += kPrimeGeneratorRemainder; + if (buffer_idx >= kNumBlocksBuffered) { + buffer_idx -= kNumBlocksBuffered; } } @@ -1544,70 +1813,58 @@ void BlockBasedTableBuilder::EnterUnbuffered() { dict, r->compression_type == kZSTD || r->compression_type == kZSTDNotFinalCompression)); - for (size_t i = 0; ok() && i < r->data_block_and_keys_buffers.size(); ++i) { - auto& data_block = r->data_block_and_keys_buffers[i].first; - auto& keys = r->data_block_and_keys_buffers[i].second; + auto get_iterator_for_block = [&r](size_t i) { + auto& data_block = r->data_block_buffers[i]; assert(!data_block.empty()); - assert(!keys.empty()); - if (r->compression_opts.parallel_threads > 1) { - ParallelCompressionRep::BlockRep* block_rep; - r->pc_rep->block_rep_pool.pop(block_rep); + Block reader{BlockContents{data_block}}; + DataBlockIter* iter = reader.NewDataIterator( + r->internal_comparator.user_comparator(), kDisableGlobalSequenceNumber); - std::swap(*(block_rep->data), data_block); - block_rep->contents = *(block_rep->data); - - block_rep->compression_type = r->compression_type; + iter->SeekToFirst(); + assert(iter->Valid()); + return std::unique_ptr(iter); + }; - block_rep->keys->SwapAssign(keys); + std::unique_ptr iter = nullptr, next_block_iter = nullptr; - if (i + 1 < r->data_block_and_keys_buffers.size()) { - block_rep->first_key_in_next_block->assign( - r->data_block_and_keys_buffers[i + 1].second.front()); - } else { - if (r->first_key_in_next_block == nullptr) { - block_rep->first_key_in_next_block.reset(nullptr); - } else { - block_rep->first_key_in_next_block->assign( - r->first_key_in_next_block->data(), - r->first_key_in_next_block->size()); - } - } + for (size_t i = 0; ok() && i < r->data_block_buffers.size(); ++i) { + if (iter == nullptr) { + iter = get_iterator_for_block(i); + assert(iter != nullptr); + }; - uint64_t new_raw_bytes_inflight = - r->pc_rep->raw_bytes_inflight.fetch_add(block_rep->data->size(), - std::memory_order_relaxed) + - block_rep->data->size(); - uint64_t new_blocks_inflight = - r->pc_rep->blocks_inflight.fetch_add(1, std::memory_order_relaxed) + - 1; - r->pc_rep->estimated_file_size.store( - r->get_offset() + - static_cast( - static_cast(new_raw_bytes_inflight) * - r->pc_rep->curr_compression_ratio.load( - std::memory_order_relaxed)) + - new_blocks_inflight * kBlockTrailerSize, - std::memory_order_relaxed); + if (i + 1 < 
r->data_block_buffers.size()) { + next_block_iter = get_iterator_for_block(i + 1); + } - // Read out first_block here to avoid data race with BGWorkWriteRawBlock - bool first_block = r->pc_rep->first_block; + auto& data_block = r->data_block_buffers[i]; - assert(block_rep->status.ok()); - if (!r->pc_rep->write_queue.push(block_rep->slot.get())) { - return; - } - if (!r->pc_rep->compress_queue.push(block_rep)) { - return; + if (r->IsParallelCompressionEnabled()) { + Slice first_key_in_next_block; + const Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + if (i + 1 < r->data_block_buffers.size()) { + assert(next_block_iter != nullptr); + first_key_in_next_block = next_block_iter->key(); + } else { + first_key_in_next_block_ptr = r->first_key_in_next_block; } - if (first_block) { - std::unique_lock lock(r->pc_rep->first_block_mutex); - r->pc_rep->first_block_cond.wait( - lock, [r] { return !r->pc_rep->first_block; }); + std::vector keys; + for (; iter->Valid(); iter->Next()) { + keys.emplace_back(iter->key().ToString()); } + + ParallelCompressionRep::BlockRep* block_rep = r->pc_rep->PrepareBlock( + r->compression_type, first_key_in_next_block_ptr, &data_block, &keys); + + assert(block_rep != nullptr); + r->pc_rep->file_size_estimator.EmitBlock(block_rep->data->size(), + r->get_offset()); + r->pc_rep->EmitBlock(block_rep); } else { - for (const auto& key : keys) { + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); if (r->filter_builder != nullptr) { size_t ts_sz = r->internal_comparator.user_comparator()->timestamp_size(); @@ -1617,16 +1874,22 @@ void BlockBasedTableBuilder::EnterUnbuffered() { } WriteBlock(Slice(data_block), &r->pending_handle, true /* is_data_block */); - if (ok() && i + 1 < r->data_block_and_keys_buffers.size()) { - Slice first_key_in_next_block = - r->data_block_and_keys_buffers[i + 1].second.front(); + if (ok() && i + 1 < r->data_block_buffers.size()) { + assert(next_block_iter != nullptr); + Slice first_key_in_next_block = next_block_iter->key(); + Slice* first_key_in_next_block_ptr = &first_key_in_next_block; - r->index_builder->AddIndexEntry( - &keys.back(), first_key_in_next_block_ptr, r->pending_handle); + + iter->SeekToLast(); + std::string last_key = iter->key().ToString(); + r->index_builder->AddIndexEntry(&last_key, first_key_in_next_block_ptr, + r->pending_handle); } } + + std::swap(iter, next_block_iter); } - r->data_block_and_keys_buffers.clear(); + r->data_block_buffers.clear(); } Status BlockBasedTableBuilder::Finish() { @@ -1638,14 +1901,8 @@ Status BlockBasedTableBuilder::Finish() { if (r->state == Rep::State::kBuffered) { EnterUnbuffered(); } - if (r->compression_opts.parallel_threads > 1) { - r->pc_rep->compress_queue.finish(); - for (auto& thread : r->pc_rep->compress_thread_pool) { - thread.join(); - } - r->pc_rep->write_queue.finish(); - r->pc_rep->write_thread->join(); - r->pc_rep->finished = true; + if (r->IsParallelCompressionEnabled()) { + StopParallelCompression(); #ifndef NDEBUG for (const auto& br : r->pc_rep->block_rep_buf) { assert(br.status.ok()); @@ -1684,25 +1941,20 @@ Status BlockBasedTableBuilder::Finish() { WriteFooter(metaindex_block_handle, index_block_handle); } r->state = Rep::State::kClosed; - Status ret_status = r->GetStatus(); + r->SetStatus(r->CopyIOStatus()); + Status ret_status = r->CopyStatus(); assert(!ret_status.ok() || io_status().ok()); return ret_status; } void BlockBasedTableBuilder::Abandon() { assert(rep_->state != Rep::State::kClosed); - if (rep_->compression_opts.parallel_threads > 1) { 
- rep_->pc_rep->compress_queue.finish(); - for (auto& thread : rep_->pc_rep->compress_thread_pool) { - thread.join(); - } - rep_->pc_rep->write_queue.finish(); - rep_->pc_rep->write_thread->join(); - rep_->pc_rep->finished = true; + if (rep_->IsParallelCompressionEnabled()) { + StopParallelCompression(); } rep_->state = Rep::State::kClosed; - rep_->GetStatus().PermitUncheckedError(); - rep_->GetIOStatus().PermitUncheckedError(); + rep_->CopyStatus().PermitUncheckedError(); + rep_->CopyIOStatus().PermitUncheckedError(); } uint64_t BlockBasedTableBuilder::NumEntries() const { @@ -1716,10 +1968,10 @@ bool BlockBasedTableBuilder::IsEmpty() const { uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } uint64_t BlockBasedTableBuilder::EstimatedFileSize() const { - if (rep_->compression_opts.parallel_threads > 1) { + if (rep_->IsParallelCompressionEnabled()) { // Use compression ratio so far and inflight raw bytes to estimate // final SST size. - return rep_->pc_rep->estimated_file_size.load(std::memory_order_relaxed); + return rep_->pc_rep->file_size_estimator.GetEstimatedFileSize(); } else { return FileSize(); } diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h index 2e3081d26ce..65be35b1948 100644 --- a/table/block_based/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -38,21 +38,9 @@ class BlockBasedTableBuilder : public TableBuilder { // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). - BlockBasedTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const std::vector>* - int_tbl_prop_collector_factories, - uint32_t column_family_id, WritableFileWriter* file, - const CompressionType compression_type, - const uint64_t sample_for_compression, - const CompressionOptions& compression_opts, const bool skip_filters, - const std::string& column_family_name, const int level_at_creation, - const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, - const uint64_t target_file_size = 0, - const uint64_t file_creation_time = 0, const std::string& db_id = "", - const std::string& db_session_id = ""); + BlockBasedTableBuilder(const BlockBasedTableOptions& table_options, + const TableBuilderOptions& table_builder_options, + WritableFileWriter* file); // No copying allowed BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; @@ -117,8 +105,9 @@ class BlockBasedTableBuilder : public TableBuilder { // REQUIRES: `rep_->state == kBuffered` void EnterUnbuffered(); - // Call block's Finish() method - // and then write the compressed block contents to file. + // Call block's Finish() method and then + // - in buffered mode, buffer the uncompressed block contents. + // - in unbuffered mode, write the compressed block contents to file. void WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block); // Compress and write block content to the file. @@ -126,10 +115,16 @@ class BlockBasedTableBuilder : public TableBuilder { bool is_data_block); // Directly write data to the file. 
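// EstimatedFileSize() above now delegates to a file-size estimator whose
// stated job is to combine the compression ratio observed so far with the raw
// bytes still in flight in the pipeline. A minimal sketch of that arithmetic,
// assuming those two quantities are all that is tracked; the real
// ParallelCompressionRep file_size_estimator is not shown in these hunks.
#include <cstdint>

class ToyFileSizeEstimator {
 public:
  // Called when a block has been compressed and written out.
  void BlockWritten(uint64_t raw_bytes, uint64_t compressed_bytes) {
    raw_written_ += raw_bytes;
    compressed_written_ += compressed_bytes;
  }
  // Called when a raw block is handed to (or taken back from) the workers.
  void BlockQueued(uint64_t raw_bytes) { raw_inflight_ += raw_bytes; }
  void BlockDequeued(uint64_t raw_bytes) { raw_inflight_ -= raw_bytes; }

  uint64_t EstimatedFileSize() const {
    // Compression ratio observed so far; assume 1.0 before the first block
    // finishes, i.e. in-flight bytes are counted uncompressed.
    double ratio = raw_written_ == 0
                       ? 1.0
                       : static_cast<double>(compressed_written_) /
                             static_cast<double>(raw_written_);
    return compressed_written_ +
           static_cast<uint64_t>(static_cast<double>(raw_inflight_) * ratio);
  }

 private:
  uint64_t raw_written_ = 0;
  uint64_t compressed_written_ = 0;
  uint64_t raw_inflight_ = 0;
};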
void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle, - bool is_data_block = false); + bool is_data_block = false, + const Slice* raw_data = nullptr); + + void SetupCacheKeyPrefix(const TableBuilderOptions& tbo); + Status InsertBlockInCache(const Slice& block_contents, - const CompressionType type, const BlockHandle* handle); + Status InsertBlockInCompressedCache(const Slice& block_contents, + const CompressionType type, + const BlockHandle* handle); void WriteFilterBlock(MetaIndexBuilder* meta_index_builder); void WriteIndexBlock(MetaIndexBuilder* meta_index_builder, @@ -159,19 +154,29 @@ class BlockBasedTableBuilder : public TableBuilder { // Get blocks from mem-table walking thread, compress them and // pass them to the write thread. Used in parallel compression mode only - void BGWorkCompression(CompressionContext& compression_ctx, + void BGWorkCompression(const CompressionContext& compression_ctx, UncompressionContext* verify_ctx); // Given raw block content, try to compress it and return result and // compression type - void CompressAndVerifyBlock( - const Slice& raw_block_contents, bool is_data_block, - CompressionContext& compression_ctx, UncompressionContext* verify_ctx, - std::string* compressed_output, Slice* result_block_contents, - CompressionType* result_compression_type, Status* out_status); + void CompressAndVerifyBlock(const Slice& raw_block_contents, + bool is_data_block, + const CompressionContext& compression_ctx, + UncompressionContext* verify_ctx, + std::string* compressed_output, + Slice* result_block_contents, + CompressionType* result_compression_type, + Status* out_status); // Get compressed blocks from BGWorkCompression and write them into SST void BGWorkWriteRawBlock(); + + // Initialize parallel compression context and + // start BGWorkCompression and BGWorkWriteRawBlock threads + void StartParallelCompression(); + + // Stop BGWorkCompression and BGWorkWriteRawBlock threads + void StopParallelCompression(); }; Slice CompressBlock(const Slice& raw, const CompressionInfo& info, diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 25e38f94c93..831235f489a 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -160,6 +160,16 @@ size_t TailPrefetchStats::GetSuggestedPrefetchSize() { } #ifndef ROCKSDB_LITE + +const std::string kOptNameMetadataCacheOpts = "metadata_cache_options"; + +static std::unordered_map + pinning_tier_type_string_map = { + {"kFallback", PinningTier::kFallback}, + {"kNone", PinningTier::kNone}, + {"kFlushedAndSimilar", PinningTier::kFlushedAndSimilar}, + {"kAll", PinningTier::kAll}}; + static std::unordered_map block_base_table_index_type_string_map = { {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, @@ -187,6 +197,29 @@ static std::unordered_map + metadata_cache_options_type_info = { + {"top_level_index_pinning", + OptionTypeInfo::Enum( + offsetof(struct MetadataCacheOptions, top_level_index_pinning), + &pinning_tier_type_string_map)}, + {"partition_pinning", + OptionTypeInfo::Enum( + offsetof(struct MetadataCacheOptions, partition_pinning), + &pinning_tier_type_string_map)}, + {"unpartitioned_pinning", + OptionTypeInfo::Enum( + offsetof(struct MetadataCacheOptions, unpartitioned_pinning), + &pinning_tier_type_string_map)}}; + +static std::unordered_map + block_base_table_prepopulate_block_cache_string_map = { + {"kDisable", BlockBasedTableOptions::PrepopulateBlockCache::kDisable}, 
+ {"kFlushOnly", + BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly}}; + #endif // ROCKSDB_LITE static std::unordered_map @@ -280,17 +313,16 @@ static std::unordered_map OptionTypeFlags::kNone, // Parses the Filter policy [](const ConfigOptions& opts, const std::string&, - const std::string& value, char* addr) { + const std::string& value, void* addr) { auto* policy = - reinterpret_cast*>(addr); + static_cast*>(addr); return FilterPolicy::CreateFromString(opts, value, policy); }, // Converts the FilterPolicy to its string representation - [](const ConfigOptions&, const std::string&, const char* addr, + [](const ConfigOptions&, const std::string&, const void* addr, std::string* value) { const auto* policy = - reinterpret_cast*>( - addr); + static_cast*>(addr); if (policy->get()) { *value = (*policy)->Name(); } else { @@ -299,15 +331,13 @@ static std::unordered_map return Status::OK(); }, // Compares two FilterPolicy objects for equality - [](const ConfigOptions&, const std::string&, const char* addr1, - const char* addr2, std::string*) { + [](const ConfigOptions&, const std::string&, const void* addr1, + const void* addr2, std::string*) { const auto* policy1 = - reinterpret_cast*>( - addr1) + static_cast*>(addr1) ->get(); const auto* policy2 = - reinterpret_cast*>(addr2) - ->get(); + static_cast*>(addr2)->get(); if (policy1 == policy2) { return true; } else if (policy1 != nullptr && policy2 != nullptr) { @@ -336,7 +366,7 @@ static std::unordered_map OptionType::kUInt32T, OptionVerificationType::kNormal, OptionTypeFlags::kNone, [](const ConfigOptions& /*opts*/, const std::string& /*name*/, - const std::string& value, char* addr) { + const std::string& value, void* addr) { // A workaround to fix a bug in 6.10, 6.11, 6.12, 6.13 // and 6.14. The bug will write out 8 bytes to OPTIONS file from the // starting address of BlockBasedTableOptions.read_amp_bytes_per_bit @@ -347,7 +377,7 @@ static std::unordered_map // generated by affected releases before the fix, we need to // manually parse read_amp_bytes_per_bit with this special hack. 
uint64_t read_amp_bytes_per_bit = ParseUint64(value); - *(reinterpret_cast(addr)) = + *(static_cast(addr)) = static_cast(read_amp_bytes_per_bit); return Status::OK(); }}}, @@ -364,14 +394,19 @@ static std::unordered_map pin_top_level_index_and_filter), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {kOptNameMetadataCacheOpts, + OptionTypeInfo::Struct( + kOptNameMetadataCacheOpts, &metadata_cache_options_type_info, + offsetof(struct BlockBasedTableOptions, metadata_cache_options), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, {"block_cache", {offsetof(struct BlockBasedTableOptions, block_cache), OptionType::kUnknown, OptionVerificationType::kNormal, (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), // Parses the input vsalue as a Cache [](const ConfigOptions& opts, const std::string&, - const std::string& value, char* addr) { - auto* cache = reinterpret_cast*>(addr); + const std::string& value, void* addr) { + auto* cache = static_cast*>(addr); return Cache::CreateFromString(opts, value, cache); }}}, {"block_cache_compressed", @@ -380,10 +415,19 @@ static std::unordered_map (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), // Parses the input vsalue as a Cache [](const ConfigOptions& opts, const std::string&, - const std::string& value, char* addr) { - auto* cache = reinterpret_cast*>(addr); + const std::string& value, void* addr) { + auto* cache = static_cast*>(addr); return Cache::CreateFromString(opts, value, cache); }}}, + {"max_auto_readahead_size", + {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"prepopulate_block_cache", + OptionTypeInfo::Enum( + offsetof(struct BlockBasedTableOptions, prepopulate_block_cache), + &block_base_table_prepopulate_block_cache_string_map)}, + #endif // ROCKSDB_LITE }; @@ -393,8 +437,7 @@ BlockBasedTableFactory::BlockBasedTableFactory( const BlockBasedTableOptions& _table_options) : table_options_(_table_options) { InitializeOptions(); - ConfigurableHelper::RegisterOptions(*this, &table_options_, - &block_based_table_type_info); + RegisterOptions(&table_options_, &block_based_table_type_info); } void BlockBasedTableFactory::InitializeOptions() { @@ -454,28 +497,16 @@ Status BlockBasedTableFactory::NewTableReader( table_reader_options.largest_seqno, table_reader_options.force_direct_prefetch, &tail_prefetch_stats_, table_reader_options.block_cache_tracer, - table_reader_options.max_file_size_for_l0_meta_pin); + table_reader_options.max_file_size_for_l0_meta_pin, + table_reader_options.cur_db_session_id, + table_reader_options.cur_file_num); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { - auto table_builder = new BlockBasedTableBuilder( - table_builder_options.ioptions, table_builder_options.moptions, - table_options_, table_builder_options.internal_comparator, - table_builder_options.int_tbl_prop_collector_factories, column_family_id, - file, table_builder_options.compression_type, - table_builder_options.sample_for_compression, - table_builder_options.compression_opts, - table_builder_options.skip_filters, - table_builder_options.column_family_name, table_builder_options.level, - table_builder_options.creation_time, - table_builder_options.oldest_key_time, - 
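// The string maps registered above (pinning_tier_type_string_map and
// block_base_table_prepopulate_block_cache_string_map, handed to
// OptionTypeInfo::Enum) are what lets the options framework turn text such as
// "kFlushedAndSimilar" into an enum value and back when reading or writing an
// OPTIONS file. A standalone sketch of that round trip using a local enum and
// hand-rolled helpers; it only illustrates the mapping, not OptionTypeInfo.
#include <string>
#include <unordered_map>

enum class ToyPinningTier { kFallback, kNone, kFlushedAndSimilar, kAll };

static const std::unordered_map<std::string, ToyPinningTier> kTierByName = {
    {"kFallback", ToyPinningTier::kFallback},
    {"kNone", ToyPinningTier::kNone},
    {"kFlushedAndSimilar", ToyPinningTier::kFlushedAndSimilar},
    {"kAll", ToyPinningTier::kAll}};

// Parse direction: e.g. "metadata_cache_options={partition_pinning=kAll;}".
bool ParseTier(const std::string& name, ToyPinningTier* out) {
  auto it = kTierByName.find(name);
  if (it == kTierByName.end()) {
    return false;  // unknown token; the real code reports an error status
  }
  *out = it->second;
  return true;
}

// Serialize direction: reverse lookup used when writing options back out.
std::string TierToString(ToyPinningTier tier) {
  for (const auto& kv : kTierByName) {
    if (kv.second == tier) {
      return kv.first;
    }
  }
  return "kFallback";
}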
table_builder_options.target_file_size, - table_builder_options.file_creation_time, table_builder_options.db_id, - table_builder_options.db_session_id); - - return table_builder; + return new BlockBasedTableBuilder(table_options_, table_builder_options, + file); } Status BlockBasedTableFactory::ValidateOptions( @@ -656,6 +687,13 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const { snprintf(buffer, kBufferSize, " block_align: %d\n", table_options_.block_align); ret.append(buffer); + snprintf(buffer, kBufferSize, + " max_auto_readahead_size: %" ROCKSDB_PRIszt "\n", + table_options_.max_auto_readahead_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " prepopulate_block_cache: %d\n", + static_cast(table_options_.prepopulate_block_cache)); + ret.append(buffer); return ret; } diff --git a/table/block_based/block_based_table_factory.h b/table/block_based/block_based_table_factory.h index a7120f8543b..534746b9d30 100644 --- a/table/block_based/block_based_table_factory.h +++ b/table/block_based/block_based_table_factory.h @@ -46,6 +46,9 @@ class BlockBasedTableFactory : public TableFactory { ~BlockBasedTableFactory() {} + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kBlockBasedTableName(); } + const char* Name() const override { return kBlockBasedTableName(); } using TableFactory::NewTableReader; @@ -57,7 +60,7 @@ class BlockBasedTableFactory : public TableFactory { TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; // Valdates the specified DB Options. Status ValidateOptions(const DBOptions& db_opts, diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h index e1f15057300..181e7824b11 100644 --- a/table/block_based/block_based_table_iterator.h +++ b/table/block_based/block_based_table_iterator.h @@ -230,7 +230,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) { if (need_upper_bound_check_ && direction == IterDirection::kBackward) { - // Upper bound check isn't sufficnet for backward direction to + // Upper bound check isn't sufficient for backward direction to // guarantee the same result as total order, so disable prefix // check. 
return true; diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index be119194b6f..acb58138de6 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -15,8 +15,8 @@ #include #include +#include "cache/cache_entry_roles.h" #include "cache/sharded_cache.h" - #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "file/file_prefetch_buffer.h" @@ -24,6 +24,7 @@ #include "file/random_access_file_reader.h" #include "monitoring/perf_context_imp.h" #include "options/options_helper.h" +#include "port/lang.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" @@ -32,6 +33,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/block_based/binary_search_index_reader.h" @@ -39,7 +41,9 @@ #include "table/block_based/block_based_filter_block.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_based_table_iterator.h" +#include "table/block_based/block_like_traits.h" #include "table/block_based/block_prefix_index.h" +#include "table/block_based/block_type.h" #include "table/block_based/filter_block.h" #include "table/block_based/full_filter_block.h" #include "table/block_based/hash_index_reader.h" @@ -54,15 +58,11 @@ #include "table/persistent_cache_helper.h" #include "table/sst_file_writer_collectors.h" #include "table/two_level_iterator.h" - -#include "monitoring/perf_context_imp.h" -#include "port/lang.h" #include "test_util/sync_point.h" #include "util/coding.h" #include "util/crc32c.h" #include "util/stop_watch.h" #include "util/string_util.h" -#include "util/xxhash.h" namespace ROCKSDB_NAMESPACE { @@ -70,84 +70,12 @@ extern const uint64_t kBlockBasedTableMagicNumber; extern const std::string kHashIndexPrefixesBlock; extern const std::string kHashIndexPrefixesMetadataBlock; -typedef BlockBasedTable::IndexReader IndexReader; - -// Found that 256 KB readahead size provides the best performance, based on -// experiments, for auto readahead. Experiment data is in PR #3282. 
-const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024; - BlockBasedTable::~BlockBasedTable() { delete rep_; } std::atomic BlockBasedTable::next_cache_key_id_(0); -template -class BlocklikeTraits; - -template <> -class BlocklikeTraits { - public: - static BlockContents* Create(BlockContents&& contents, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool /* using_zstd */, - const FilterPolicy* /* filter_policy */) { - return new BlockContents(std::move(contents)); - } - - static uint32_t GetNumRestarts(const BlockContents& /* contents */) { - return 0; - } -}; - -template <> -class BlocklikeTraits { - public: - static ParsedFullFilterBlock* Create(BlockContents&& contents, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool /* using_zstd */, - const FilterPolicy* filter_policy) { - return new ParsedFullFilterBlock(filter_policy, std::move(contents)); - } - - static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { - return 0; - } -}; - -template <> -class BlocklikeTraits { - public: - static Block* Create(BlockContents&& contents, size_t read_amp_bytes_per_bit, - Statistics* statistics, bool /* using_zstd */, - const FilterPolicy* /* filter_policy */) { - return new Block(std::move(contents), read_amp_bytes_per_bit, statistics); - } - - static uint32_t GetNumRestarts(const Block& block) { - return block.NumRestarts(); - } -}; - -template <> -class BlocklikeTraits { - public: - static UncompressionDict* Create(BlockContents&& contents, - size_t /* read_amp_bytes_per_bit */, - Statistics* /* statistics */, - bool using_zstd, - const FilterPolicy* /* filter_policy */) { - return new UncompressionDict(contents.data, std::move(contents.allocation), - using_zstd); - } - - static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { - return 0; - } -}; - namespace { // Read the block identified by "handle" from "file". // The only relevant option is options.verify_checksums for now. @@ -159,7 +87,7 @@ template Status ReadBlockFromFile( RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - std::unique_ptr* result, const ImmutableCFOptions& ioptions, + std::unique_ptr* result, const ImmutableOptions& ioptions, bool do_uncompress, bool maybe_compressed, BlockType block_type, const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, size_t read_amp_bytes_per_bit, @@ -175,20 +103,13 @@ Status ReadBlockFromFile( Status s = block_fetcher.ReadBlockContents(); if (s.ok()) { result->reset(BlocklikeTraits::Create( - std::move(contents), read_amp_bytes_per_bit, ioptions.statistics, - using_zstd, filter_policy)); + std::move(contents), read_amp_bytes_per_bit, ioptions.stats, using_zstd, + filter_policy)); } return s; } -// Delete the entry resided in the cache. -template -void DeleteCachedEntry(const Slice& /*key*/, void* value) { - auto entry = reinterpret_cast(value); - delete entry; -} - // Release the cached entry and decrement its ref count. 
// Do not force erase void ReleaseCachedEntry(void* arg, void* h) { @@ -230,7 +151,7 @@ CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, GetContext* get_context, size_t usage) const { - Statistics* const statistics = rep_->ioptions.statistics; + Statistics* const statistics = rep_->ioptions.stats; PERF_COUNTER_ADD(block_cache_hit_count, 1); PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, @@ -288,7 +209,7 @@ void BlockBasedTable::UpdateCacheHitMetrics(BlockType block_type, void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, GetContext* get_context) const { - Statistics* const statistics = rep_->ioptions.statistics; + Statistics* const statistics = rep_->ioptions.stats; // TODO: introduce aggregate (not per-level) block cache miss count PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1, @@ -338,12 +259,9 @@ void BlockBasedTable::UpdateCacheMissMetrics(BlockType block_type, } } -void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type, - GetContext* get_context, - size_t usage, - bool redundant) const { - Statistics* const statistics = rep_->ioptions.statistics; - +void BlockBasedTable::UpdateCacheInsertionMetrics( + BlockType block_type, GetContext* get_context, size_t usage, bool redundant, + Statistics* const statistics) { // TODO: introduce perf counters for block cache insertions if (get_context) { ++get_context->get_context_stats_.num_cache_add; @@ -432,9 +350,12 @@ void BlockBasedTable::UpdateCacheInsertionMetrics(BlockType block_type, } Cache::Handle* BlockBasedTable::GetEntryFromCache( - Cache* block_cache, const Slice& key, BlockType block_type, - GetContext* get_context) const { - auto cache_handle = block_cache->Lookup(key, rep_->ioptions.statistics); + Cache* block_cache, const Slice& key, BlockType block_type, const bool wait, + GetContext* get_context, const Cache::CacheItemHelper* cache_helper, + const Cache::CreateCallback& create_cb, Cache::Priority priority) const { + auto cache_handle = + block_cache->Lookup(key, cache_helper, create_cb, priority, wait, + rep_->ioptions.statistics.get()); if (cache_handle != nullptr) { UpdateCacheHitMetrics(block_type, get_context, @@ -447,26 +368,29 @@ Cache::Handle* BlockBasedTable::GetEntryFromCache( } // Helper function to setup the cache key's prefix for the Table. 
-void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { +void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, + const std::string& db_session_id, + uint64_t cur_file_num) { assert(kMaxCacheKeyPrefixSize >= 10); rep->cache_key_prefix_size = 0; rep->compressed_cache_key_prefix_size = 0; if (rep->table_options.block_cache != nullptr) { GenerateCachePrefix( rep->table_options.block_cache.get(), rep->file->file(), - &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); + &rep->cache_key_prefix[0], &rep->cache_key_prefix_size, db_session_id, + cur_file_num); } if (rep->table_options.persistent_cache != nullptr) { GenerateCachePrefix( rep->table_options.persistent_cache.get(), rep->file->file(), &rep->persistent_cache_key_prefix[0], - &rep->persistent_cache_key_prefix_size); + &rep->persistent_cache_key_prefix_size, "", cur_file_num); } if (rep->table_options.block_cache_compressed != nullptr) { GenerateCachePrefix( rep->table_options.block_cache_compressed.get(), rep->file->file(), &rep->compressed_cache_key_prefix[0], - &rep->compressed_cache_key_prefix_size); + &rep->compressed_cache_key_prefix_size, "", cur_file_num); } } @@ -577,7 +501,7 @@ Slice BlockBasedTable::GetCacheKey(const char* cache_key_prefix, } Status BlockBasedTable::Open( - const ReadOptions& read_options, const ImmutableCFOptions& ioptions, + const ReadOptions& read_options, const ImmutableOptions& ioptions, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, @@ -588,7 +512,8 @@ Status BlockBasedTable::Open( const SequenceNumber largest_seqno, const bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, BlockCacheTracer* const block_cache_tracer, - size_t max_file_size_for_l0_meta_pin) { + size_t max_file_size_for_l0_meta_pin, const std::string& db_session_id, + uint64_t cur_file_num) { table_reader->reset(); Status s; @@ -612,6 +537,10 @@ Status BlockBasedTable::Open( s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch, tail_prefetch_stats, prefetch_all, preload_all, &prefetch_buffer); + // Return error in prefetch path to users. + if (!s.ok()) { + return s; + } } else { // Should not prefetch for mmap mode. prefetch_buffer.reset(new FilePrefetchBuffer( @@ -627,7 +556,7 @@ Status BlockBasedTable::Open( // 6. [meta block: index] // 7. [meta block: filter] IOOptions opts; - s = PrepareIOFromReadOptions(ro, file->env(), opts); + s = file->PrepareIOOptions(ro, opts); if (s.ok()) { s = ReadFooterFromFile(opts, file.get(), prefetch_buffer.get(), file_size, &footer, kBlockBasedTableMagicNumber); @@ -658,7 +587,7 @@ Status BlockBasedTable::Open( rep->internal_prefix_transform.reset( new InternalKeySliceTransform(prefix_extractor)); } - SetupCacheKeyPrefix(rep); + SetupCacheKeyPrefix(rep, db_session_id, cur_file_num); std::unique_ptr new_table( new BlockBasedTable(rep, block_cache_tracer)); @@ -667,7 +596,7 @@ Status BlockBasedTable::Open( PersistentCacheOptions(rep->table_options.persistent_cache, std::string(rep->persistent_cache_key_prefix, rep->persistent_cache_key_prefix_size), - rep->ioptions.statistics); + rep->ioptions.stats); // Meta-blocks are not dictionary compressed. 
Explicitly set the dictionary // handle to null, otherwise it may be seen as uninitialized during the below @@ -760,7 +689,7 @@ Status BlockBasedTable::PrefetchTail( // Use `FilePrefetchBuffer` prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true)); IOOptions opts; - Status s = PrepareIOFromReadOptions(ro, file->env(), opts); + Status s = file->PrepareIOOptions(ro, opts); if (s.ok()) { s = (*prefetch_buffer)->Prefetch(opts, file, prefetch_off, prefetch_len); } @@ -810,7 +739,7 @@ Status BlockBasedTable::ReadPropertiesBlock( s = SeekToPropertiesBlock(meta_iter, &found_properties_block); if (!s.ok()) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "Error when seeking to properties block from file: %s", s.ToString().c_str()); } else if (found_properties_block) { @@ -837,7 +766,7 @@ Status BlockBasedTable::ReadPropertiesBlock( } if (!s.ok()) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "Encountered error while reading data from properties " "block %s", s.ToString().c_str()); @@ -854,7 +783,7 @@ Status BlockBasedTable::ReadPropertiesBlock( CompressionTypeToString(kZSTDNotFinalCompression)); } } else { - ROCKS_LOG_ERROR(rep_->ioptions.info_log, + ROCKS_LOG_ERROR(rep_->ioptions.logger, "Cannot find Properties block from file."); } #ifndef ROCKSDB_LITE @@ -869,11 +798,10 @@ Status BlockBasedTable::ReadPropertiesBlock( rep_->whole_key_filtering &= IsFeatureSupported(*(rep_->table_properties), BlockBasedTablePropertyNames::kWholeKeyFiltering, - rep_->ioptions.info_log); - rep_->prefix_filtering &= - IsFeatureSupported(*(rep_->table_properties), - BlockBasedTablePropertyNames::kPrefixFiltering, - rep_->ioptions.info_log); + rep_->ioptions.logger); + rep_->prefix_filtering &= IsFeatureSupported( + *(rep_->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, rep_->ioptions.logger); rep_->index_key_includes_seq = rep_->table_properties->index_key_is_user_key == 0; @@ -896,7 +824,7 @@ Status BlockBasedTable::ReadPropertiesBlock( s = GetGlobalSequenceNumber(*(rep_->table_properties), largest_seqno, &(rep_->global_seqno)); if (!s.ok()) { - ROCKS_LOG_ERROR(rep_->ioptions.info_log, "%s", s.ToString().c_str()); + ROCKS_LOG_ERROR(rep_->ioptions.logger, "%s", s.ToString().c_str()); } } return s; @@ -913,7 +841,7 @@ Status BlockBasedTable::ReadRangeDelBlock( s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); if (!s.ok()) { ROCKS_LOG_WARN( - rep_->ioptions.info_log, + rep_->ioptions.logger, "Error when seeking to range delete tombstones block from file: %s", s.ToString().c_str()); } else if (found_range_del_block && !range_del_handle.IsNull()) { @@ -925,7 +853,7 @@ Status BlockBasedTable::ReadRangeDelBlock( s = iter->status(); if (!s.ok()) { ROCKS_LOG_WARN( - rep_->ioptions.info_log, + rep_->ioptions.logger, "Encountered error while reading data from range del block %s", s.ToString().c_str()); IGNORE_STATUS_IF_ERROR(s); @@ -974,6 +902,9 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( } } } + // Partition filters cannot be enabled without partition indexes + assert(rep_->filter_type != Rep::FilterType::kPartitionedFilter || + rep_->index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // Find compression dictionary handle bool found_compression_dict = false; @@ -987,20 +918,53 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( const bool use_cache = table_options.cache_index_and_filter_blocks; - // pin both index and filters, down to all partitions. 
- const bool pin_all = - rep_->table_options.pin_l0_filter_and_index_blocks_in_cache && + const bool maybe_flushed = level == 0 && file_size <= max_file_size_for_l0_meta_pin; + std::function is_pinned = + [maybe_flushed, &is_pinned](PinningTier pinning_tier, + PinningTier fallback_pinning_tier) { + // Fallback to fallback would lead to infinite recursion. Disallow it. + assert(fallback_pinning_tier != PinningTier::kFallback); + + switch (pinning_tier) { + case PinningTier::kFallback: + return is_pinned(fallback_pinning_tier, + PinningTier::kNone /* fallback_pinning_tier */); + case PinningTier::kNone: + return false; + case PinningTier::kFlushedAndSimilar: + return maybe_flushed; + case PinningTier::kAll: + return true; + }; + + // In GCC, this is needed to suppress `control reaches end of non-void + // function [-Werror=return-type]`. + assert(false); + return false; + }; + const bool pin_top_level_index = is_pinned( + table_options.metadata_cache_options.top_level_index_pinning, + table_options.pin_top_level_index_and_filter ? PinningTier::kAll + : PinningTier::kNone); + const bool pin_partition = + is_pinned(table_options.metadata_cache_options.partition_pinning, + table_options.pin_l0_filter_and_index_blocks_in_cache + ? PinningTier::kFlushedAndSimilar + : PinningTier::kNone); + const bool pin_unpartitioned = + is_pinned(table_options.metadata_cache_options.unpartitioned_pinning, + table_options.pin_l0_filter_and_index_blocks_in_cache + ? PinningTier::kFlushedAndSimilar + : PinningTier::kNone); - // prefetch the first level of index - const bool prefetch_index = - prefetch_all || - (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); // pin the first level of index const bool pin_index = - pin_all || (table_options.pin_top_level_index_and_filter && - index_type == BlockBasedTableOptions::kTwoLevelIndexSearch); + index_type == BlockBasedTableOptions::kTwoLevelIndexSearch + ? pin_top_level_index + : pin_unpartitioned; + // prefetch the first level of index + const bool prefetch_index = prefetch_all || pin_index; std::unique_ptr index_reader; s = new_table->CreateIndexReader(ro, prefetch_buffer, meta_iter, use_cache, @@ -1015,44 +979,43 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // The partitions of partitioned index are always stored in cache. They // are hence follow the configuration for pin and prefetch regardless of // the value of cache_index_and_filter_blocks - if (prefetch_all) { - s = rep_->index_reader->CacheDependencies(ro, pin_all); + if (prefetch_all || pin_partition) { + s = rep_->index_reader->CacheDependencies(ro, pin_partition); } if (!s.ok()) { return s; } - // prefetch the first level of filter - const bool prefetch_filter = - prefetch_all || - (table_options.pin_top_level_index_and_filter && - rep_->filter_type == Rep::FilterType::kPartitionedFilter); - // Partition fitlers cannot be enabled without partition indexes - assert(!prefetch_filter || prefetch_index); // pin the first level of filter const bool pin_filter = - pin_all || (table_options.pin_top_level_index_and_filter && - rep_->filter_type == Rep::FilterType::kPartitionedFilter); + rep_->filter_type == Rep::FilterType::kPartitionedFilter + ? 
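// The is_pinned lambda above resolves a per-tier MetadataCacheOptions setting
// against a fallback derived from the legacy pin_* booleans. The same decision
// written as a free function over a local enum, so it can be read in
// isolation; the behaviour mirrors the switch in the hunk above, and the enum
// is a local copy rather than the real PinningTier.
#include <cassert>

enum class ToyPinningTier { kFallback, kNone, kFlushedAndSimilar, kAll };

bool IsPinned(ToyPinningTier tier, ToyPinningTier fallback_tier,
              bool maybe_flushed /* L0 file small enough to treat as flushed */) {
  // Falling back to kFallback would recurse forever, so it is disallowed,
  // mirroring the assert in the lambda.
  assert(fallback_tier != ToyPinningTier::kFallback);
  switch (tier) {
    case ToyPinningTier::kFallback:
      return IsPinned(fallback_tier, ToyPinningTier::kNone, maybe_flushed);
    case ToyPinningTier::kNone:
      return false;
    case ToyPinningTier::kFlushedAndSimilar:
      return maybe_flushed;
    case ToyPinningTier::kAll:
      return true;
  }
  return false;  // unreachable; keeps -Werror=return-type quiet, as above
}

// Example: with top_level_index_pinning left at kFallback and
// pin_top_level_index_and_filter == true, the fallback resolves to kAll, so
// IsPinned(ToyPinningTier::kFallback, ToyPinningTier::kAll, false) is true.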
pin_top_level_index + : pin_unpartitioned; + // prefetch the first level of filter + const bool prefetch_filter = prefetch_all || pin_filter; if (rep_->filter_policy) { auto filter = new_table->CreateFilterBlockReader( ro, prefetch_buffer, use_cache, prefetch_filter, pin_filter, lookup_context); + if (filter) { // Refer to the comment above about paritioned indexes always being cached - if (prefetch_all) { - filter->CacheDependencies(ro, pin_all); + if (prefetch_all || pin_partition) { + s = filter->CacheDependencies(ro, pin_partition); + if (!s.ok()) { + return s; + } } - rep_->filter = std::move(filter); } } if (!rep_->compression_dict_handle.IsNull()) { std::unique_ptr uncompression_dict_reader; - s = UncompressionDictReader::Create(this, ro, prefetch_buffer, use_cache, - prefetch_all, pin_all, lookup_context, - &uncompression_dict_reader); + s = UncompressionDictReader::Create( + this, ro, prefetch_buffer, use_cache, prefetch_all || pin_unpartitioned, + pin_unpartitioned, lookup_context, &uncompression_dict_reader); if (!s.ok()) { return s; } @@ -1121,7 +1084,7 @@ Status BlockBasedTable::ReadMetaIndexBlock( nullptr /* filter_policy */); if (!s.ok()) { - ROCKS_LOG_ERROR(rep_->ioptions.info_log, + ROCKS_LOG_ERROR(rep_->ioptions.logger, "Encountered error while reading data from properties" " block %s", s.ToString().c_str()); @@ -1141,22 +1104,36 @@ Status BlockBasedTable::GetDataBlockFromCache( Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, BlockType block_type, - GetContext* get_context) const { + const bool wait, GetContext* get_context) const { const size_t read_amp_bytes_per_bit = block_type == BlockType::kData ? rep_->table_options.read_amp_bytes_per_bit : 0; assert(block); assert(block->IsEmpty()); + const Cache::Priority priority = + rep_->table_options.cache_index_and_filter_blocks_with_high_priority && + (block_type == BlockType::kFilter || + block_type == BlockType::kCompressionDictionary || + block_type == BlockType::kIndex) + ? 
Cache::Priority::HIGH + : Cache::Priority::LOW; Status s; BlockContents* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; + Statistics* statistics = rep_->ioptions.statistics.get(); + bool using_zstd = rep_->blocks_definitely_zstd_compressed; + const FilterPolicy* filter_policy = rep_->filter_policy; + Cache::CreateCallback create_cb = GetCreateCallback( + read_amp_bytes_per_bit, statistics, using_zstd, filter_policy); // Lookup uncompressed cache first if (block_cache != nullptr) { - auto cache_handle = GetEntryFromCache(block_cache, block_cache_key, - block_type, get_context); + auto cache_handle = GetEntryFromCache( + block_cache, block_cache_key, block_type, wait, get_context, + BlocklikeTraits::GetCacheItemHelper(block_type), create_cb, + priority); if (cache_handle != nullptr) { block->SetCachedValue( reinterpret_cast(block_cache->Value(cache_handle)), @@ -1173,10 +1150,13 @@ Status BlockBasedTable::GetDataBlockFromCache( } assert(!compressed_block_cache_key.empty()); - block_cache_compressed_handle = - block_cache_compressed->Lookup(compressed_block_cache_key); - - Statistics* statistics = rep_->ioptions.statistics; + BlockContents contents; + Cache::CreateCallback create_cb_special = GetCreateCallback( + read_amp_bytes_per_bit, statistics, using_zstd, filter_policy); + block_cache_compressed_handle = block_cache_compressed->Lookup( + compressed_block_cache_key, + BlocklikeTraits::GetCacheItemHelper(block_type), + create_cb_special, priority, true); // if we found in the compressed cache, then uncompress and insert into // uncompressed cache @@ -1193,7 +1173,6 @@ Status BlockBasedTable::GetDataBlockFromCache( assert(compression_type != kNoCompression); // Retrieve the uncompressed contents into a new buffer - BlockContents contents; UncompressionContext context(compression_type); UncompressionInfo info(context, uncompression_dict, compression_type); s = UncompressBlockContents( @@ -1201,7 +1180,8 @@ Status BlockBasedTable::GetDataBlockFromCache( &contents, rep_->table_options.format_version, rep_->ioptions, GetMemoryAllocator(rep_->table_options)); - // Insert uncompressed block into block cache + // Insert uncompressed block into block cache, the priority is based on the + // data block type. 
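// GetDataBlockFromCache() above now chooses the cache priority for both the
// lookup and the later insert from the block type: filter, index and
// compression-dictionary blocks go in at high priority when
// cache_index_and_filter_blocks_with_high_priority is set, everything else at
// low priority. The same decision as a standalone helper over local enums
// (ToyBlockType/ToyCachePriority are stand-ins for the real types).
enum class ToyBlockType { kData, kFilter, kIndex, kCompressionDictionary };
enum class ToyCachePriority { kLow, kHigh };

ToyCachePriority PriorityFor(ToyBlockType type,
                             bool metadata_blocks_get_high_priority) {
  const bool is_metadata = type == ToyBlockType::kFilter ||
                           type == ToyBlockType::kIndex ||
                           type == ToyBlockType::kCompressionDictionary;
  return (metadata_blocks_get_high_priority && is_metadata)
             ? ToyCachePriority::kHigh
             : ToyCachePriority::kLow;
}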
if (s.ok()) { std::unique_ptr block_holder( BlocklikeTraits::Create( @@ -1213,15 +1193,17 @@ Status BlockBasedTable::GetDataBlockFromCache( read_options.fill_cache) { size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; - s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle); + s = block_cache->Insert( + block_cache_key, block_holder.get(), + BlocklikeTraits::GetCacheItemHelper(block_type), charge, + &cache_handle, priority); if (s.ok()) { assert(cache_handle != nullptr); block->SetCachedValue(block_holder.release(), block_cache, cache_handle); UpdateCacheInsertionMetrics(block_type, get_context, charge, - s.IsOkOverwritten()); + s.IsOkOverwritten(), rep_->ioptions.stats); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); } @@ -1244,7 +1226,7 @@ Status BlockBasedTable::PutDataBlockToCache( const UncompressionDict& uncompression_dict, MemoryAllocator* memory_allocator, BlockType block_type, GetContext* get_context) const { - const ImmutableCFOptions& ioptions = rep_->ioptions; + const ImmutableOptions& ioptions = rep_->ioptions; const uint32_t format_version = rep_->table_options.format_version; const size_t read_amp_bytes_per_bit = block_type == BlockType::kData @@ -1261,7 +1243,7 @@ Status BlockBasedTable::PutDataBlockToCache( assert(cached_block->IsEmpty()); Status s; - Statistics* statistics = ioptions.statistics; + Statistics* statistics = ioptions.stats; std::unique_ptr block_holder; if (raw_block_comp_type != kNoCompression) { @@ -1303,8 +1285,8 @@ Status BlockBasedTable::PutDataBlockToCache( new BlockContents(std::move(*raw_block_contents)); s = block_cache_compressed->Insert( compressed_block_cache_key, block_cont_for_comp_cache, - block_cont_for_comp_cache->ApproximateMemoryUsage(), - &DeleteCachedEntry); + BlocklikeTraits::GetCacheItemHelper(block_type), + block_cont_for_comp_cache->ApproximateMemoryUsage()); if (s.ok()) { // Avoid the following code to delete this cached block. 
RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); @@ -1318,16 +1300,17 @@ Status BlockBasedTable::PutDataBlockToCache( if (block_cache != nullptr && block_holder->own_bytes()) { size_t charge = block_holder->ApproximateMemoryUsage(); Cache::Handle* cache_handle = nullptr; - s = block_cache->Insert(block_cache_key, block_holder.get(), charge, - &DeleteCachedEntry, &cache_handle, - priority); + s = block_cache->Insert( + block_cache_key, block_holder.get(), + BlocklikeTraits::GetCacheItemHelper(block_type), charge, + &cache_handle, priority); if (s.ok()) { assert(cache_handle != nullptr); cached_block->SetCachedValue(block_holder.release(), block_cache, cache_handle); UpdateCacheInsertionMetrics(block_type, get_context, charge, - s.IsOkOverwritten()); + s.IsOkOverwritten(), rep_->ioptions.stats); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); } @@ -1392,8 +1375,7 @@ DataBlockIter* BlockBasedTable::InitBlockIterator( DataBlockIter* input_iter, bool block_contents_pinned) { return block->NewDataIterator(rep->internal_comparator.user_comparator(), rep->get_global_seqno(block_type), input_iter, - rep->ioptions.statistics, - block_contents_pinned); + rep->ioptions.stats, block_contents_pinned); } template <> @@ -1402,7 +1384,7 @@ IndexBlockIter* BlockBasedTable::InitBlockIterator( IndexBlockIter* input_iter, bool block_contents_pinned) { return block->NewIndexIterator( rep->internal_comparator.user_comparator(), - rep->get_global_seqno(block_type), input_iter, rep->ioptions.statistics, + rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats, /* total_order_seek */ true, rep->index_has_first_key, rep->index_key_includes_seq, rep->index_value_is_full, block_contents_pinned); @@ -1417,9 +1399,9 @@ template Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context, - BlockContents* contents) const { + const bool wait, CachableEntry* block_entry, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, BlockContents* contents) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep_->table_options.block_cache.get(); @@ -1451,18 +1433,28 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( if (!contents) { s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, ro, block_entry, uncompression_dict, block_type, - get_context); - if (block_entry->GetValue()) { + wait, get_context); + // Value could still be null at this point, so check the cache handle + // and update the read pattern for prefetching + if (block_entry->GetValue() || block_entry->GetCacheHandle()) { // TODO(haoyu): Differentiate cache hit on uncompressed block cache and // compressed block cache. is_cache_hit = true; + if (prefetch_buffer) { + // Update the block details so that PrefetchBuffer can use the read + // pattern to determine if reads are sequential or not for + // prefetching. It should also take in account blocks read from cache. + prefetch_buffer->UpdateReadPattern(handle.offset(), + block_size(handle)); + } } } // Can't find the block from the cache. If I/O is allowed, read from the // file. 
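// The cache-hit path above now calls UpdateReadPattern() so that blocks served
// from the block cache still count toward the prefetcher's view of the access
// pattern. A toy tracker with the same shape, assuming the simplest possible
// sequential test (the next access starts where the previous one ended); the
// real FilePrefetchBuffer heuristics are not part of these hunks.
#include <cstddef>
#include <cstdint>

class ToyReadPatternTracker {
 public:
  // Record an access, whether it was served from cache or from the file.
  void UpdateReadPattern(uint64_t offset, size_t len) {
    sequential_ = have_prev_ && offset == prev_end_;
    prev_end_ = offset + len;
    have_prev_ = true;
  }
  // A prefetcher would only keep ramping up readahead while this stays true.
  bool LooksSequential() const { return sequential_; }

 private:
  bool have_prev_ = false;
  bool sequential_ = false;
  uint64_t prev_end_ = 0;
};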
- if (block_entry->GetValue() == nullptr && !no_io && ro.fill_cache) { - Statistics* statistics = rep_->ioptions.statistics; + if (block_entry->GetValue() == nullptr && + block_entry->GetCacheHandle() == nullptr && !no_io && ro.fill_cache) { + Statistics* statistics = rep_->ioptions.stats; const bool maybe_compressed = block_type != BlockType::kFilter && block_type != BlockType::kCompressionDictionary && @@ -1471,7 +1463,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( CompressionType raw_block_comp_type; BlockContents raw_block_contents; if (!contents) { - StopWatch sw(rep_->ioptions.env, statistics, READ_BLOCK_GET_MICROS); + StopWatch sw(rep_->ioptions.clock, statistics, READ_BLOCK_GET_MICROS); BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &raw_block_contents, rep_->ioptions, do_uncompress, @@ -1560,7 +1552,7 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( // Avoid making copy of block_key and cf_name when constructing the access // record. BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), + rep_->ioptions.clock->NowMicros(), /*block_key=*/"", trace_block_type, /*block_size=*/usage, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), @@ -1605,7 +1597,7 @@ void BlockBasedTable::RetrieveMultipleBlocks( char* scratch, const UncompressionDict& uncompression_dict) const { RandomAccessFileReader* file = rep_->file.get(); const Footer& footer = rep_->footer; - const ImmutableCFOptions& ioptions = rep_->ioptions; + const ImmutableOptions& ioptions = rep_->ioptions; size_t read_amp_bytes_per_bit = rep_->table_options.read_amp_bytes_per_bit; MemoryAllocator* memory_allocator = GetMemoryAllocator(rep_->table_options); @@ -1624,7 +1616,8 @@ void BlockBasedTable::RetrieveMultipleBlocks( RetrieveBlock(nullptr, options, handle, uncompression_dict, &(*results)[idx_in_batch], BlockType::kData, mget_iter->get_context, &lookup_data_block_context, - /* for_compaction */ false, /* use_cache */ true); + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ true); } return; } @@ -1703,7 +1696,7 @@ void BlockBasedTable::RetrieveMultipleBlocks( AlignedBuf direct_io_buf; { IOOptions opts; - IOStatus s = PrepareIOFromReadOptions(options, file->env(), opts); + IOStatus s = file->PrepareIOOptions(options, opts); if (s.IsTimedOut()) { for (FSReadRequest& req : read_reqs) { req.status = s; @@ -1731,6 +1724,9 @@ void BlockBasedTable::RetrieveMultipleBlocks( size_t& req_idx = req_idx_for_block[valid_batch_idx]; size_t& req_offset = req_offset_for_block[valid_batch_idx]; valid_batch_idx++; + if (mget_iter->get_context) { + ++(mget_iter->get_context->get_context_stats_.num_data_read); + } FSReadRequest& req = read_reqs[req_idx]; Status s = req.status; if (s.ok()) { @@ -1784,13 +1780,21 @@ void BlockBasedTable::RetrieveMultipleBlocks( if (s.ok()) { // When the blocks share the same underlying buffer (scratch or direct io - // buffer), if the block is compressed, the shared buffer will be - // uncompressed into heap during uncompressing; otherwise, we need to - // manually copy the block into heap before inserting the block to block - // cache. + // buffer), we may need to manually copy the block into heap if the raw + // block has to be inserted into a cache. That falls into th following + // cases - + // 1. Raw block is not compressed, it needs to be inserted into the + // uncompressed block cache if there is one + // 2. 
If the raw block is compressed, it needs to be inserted into the + // compressed block cache if there is one + // + // In all other cases, the raw block is either uncompressed into a heap + // buffer or there is no cache at all. CompressionType compression_type = raw_block_contents.get_compression_type(); - if (use_shared_buffer && compression_type == kNoCompression) { + if (use_shared_buffer && (compression_type == kNoCompression || + (compression_type != kNoCompression && + rep_->table_options.block_cache_compressed))) { Slice raw = Slice(req.result.data() + req_offset, block_size(handle)); raw_block_contents = BlockContents( CopyBufferToHeap(GetMemoryAllocator(rep_->table_options), raw), @@ -1810,8 +1814,8 @@ void BlockBasedTable::RetrieveMultipleBlocks( // necessary. Since we're passing the raw block contents, it will // avoid looking up the block cache s = MaybeReadBlockAndLoadToCache( - nullptr, options, handle, uncompression_dict, block_entry, - BlockType::kData, mget_iter->get_context, + nullptr, options, handle, uncompression_dict, /*wait=*/true, + block_entry, BlockType::kData, mget_iter->get_context, &lookup_data_block_context, &raw_block_contents); // block_entry value could be null if no block cache is present, i.e @@ -1845,7 +1849,7 @@ void BlockBasedTable::RetrieveMultipleBlocks( } if (s.ok()) { (*results)[idx_in_batch].SetOwnedValue(new Block( - std::move(contents), read_amp_bytes_per_bit, ioptions.statistics)); + std::move(contents), read_amp_bytes_per_bit, ioptions.stats)); } } (*statuses)[idx_in_batch] = s; @@ -1858,22 +1862,23 @@ Status BlockBasedTable::RetrieveBlock( const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const { + bool for_compaction, bool use_cache, bool wait_for_cache) const { assert(block_entry); assert(block_entry->IsEmpty()); Status s; if (use_cache) { - s = MaybeReadBlockAndLoadToCache(prefetch_buffer, ro, handle, - uncompression_dict, block_entry, - block_type, get_context, lookup_context, - /*contents=*/nullptr); + s = MaybeReadBlockAndLoadToCache( + prefetch_buffer, ro, handle, uncompression_dict, wait_for_cache, + block_entry, block_type, get_context, lookup_context, + /*contents=*/nullptr); if (!s.ok()) { return s; } - if (block_entry->GetValue() != nullptr) { + if (block_entry->GetValue() != nullptr || + block_entry->GetCacheHandle() != nullptr) { assert(s.ok()); return s; } @@ -1894,7 +1899,7 @@ Status BlockBasedTable::RetrieveBlock( std::unique_ptr block; { - StopWatch sw(rep_->ioptions.env, rep_->ioptions.statistics, + StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, @@ -1941,28 +1946,28 @@ template Status BlockBasedTable::RetrieveBlock( const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, 
bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; template Status BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, bool wait_for_cache) const; BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( const BlockBasedTable* table, @@ -1990,6 +1995,7 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( rep->index_value_is_full); } // Create an empty iterator + // TODO(ajkr): this is not the right way to handle an unpinned partition. return new IndexBlockIter(); } @@ -2026,8 +2032,10 @@ bool BlockBasedTable::PrefixMayMatch( } else { prefix_extractor = rep_->table_prefix_extractor.get(); } - auto user_key = ExtractUserKey(internal_key); - if (!prefix_extractor->InDomain(user_key)) { + auto ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); + auto user_key_without_ts = + ExtractUserKeyAndStripTimestamp(internal_key, ts_sz); + if (!prefix_extractor->InDomain(user_key_without_ts)) { return true; } @@ -2042,15 +2050,16 @@ bool BlockBasedTable::PrefixMayMatch( if (!filter->IsBlockBased()) { const Slice* const const_ikey_ptr = &internal_key; may_match = filter->RangeMayExist( - read_options.iterate_upper_bound, user_key, prefix_extractor, - rep_->internal_comparator.user_comparator(), const_ikey_ptr, - &filter_checked, need_upper_bound_check, no_io, lookup_context); + read_options.iterate_upper_bound, user_key_without_ts, + prefix_extractor, rep_->internal_comparator.user_comparator(), + const_ikey_ptr, &filter_checked, need_upper_bound_check, no_io, + lookup_context); } else { // if prefix_extractor changed for block based filter, skip filter if (need_upper_bound_check) { return true; } - auto prefix = prefix_extractor->Transform(user_key); + auto prefix = prefix_extractor->Transform(user_key_without_ts); InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -2104,7 +2113,7 @@ bool BlockBasedTable::PrefixMayMatch( } if (filter_checked) { - Statistics* statistics = rep_->ioptions.statistics; + Statistics* statistics = rep_->ioptions.stats; RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); if (!may_match) { RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); @@ -2170,25 +2179,24 @@ bool BlockBasedTable::FullFilterKeyMayMatch( Slice user_key = ExtractUserKey(internal_key); const Slice* const const_ikey_ptr = &internal_key; bool may_match = true; + size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); + Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); if (rep_->whole_key_filtering) { - size_t ts_sz = - rep_->internal_comparator.user_comparator()->timestamp_size(); - Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); may_match = 
filter->KeyMayMatch(user_key_without_ts, prefix_extractor, kNotValid, no_io, const_ikey_ptr, get_context, lookup_context); } else if (!read_options.total_order_seek && prefix_extractor && rep_->table_properties->prefix_extractor_name.compare( prefix_extractor->Name()) == 0 && - prefix_extractor->InDomain(user_key) && - !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), - prefix_extractor, kNotValid, no_io, - const_ikey_ptr, get_context, - lookup_context)) { + prefix_extractor->InDomain(user_key_without_ts) && + !filter->PrefixMayMatch( + prefix_extractor->Transform(user_key_without_ts), + prefix_extractor, kNotValid, no_io, const_ikey_ptr, + get_context, lookup_context)) { may_match = false; } if (may_match) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); } return may_match; @@ -2209,14 +2217,13 @@ void BlockBasedTable::FullFilterKeysMayMatch( lookup_context); uint64_t after_keys = range->KeysLeft(); if (after_keys) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE, - after_keys); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE, after_keys); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, after_keys, rep_->level); } uint64_t filtered_keys = before_keys - after_keys; if (filtered_keys) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL, filtered_keys); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL, filtered_keys); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, filtered_keys, rep_->level); } @@ -2225,12 +2232,11 @@ void BlockBasedTable::FullFilterKeysMayMatch( prefix_extractor->Name()) == 0) { filter->PrefixesMayMatch(range, prefix_extractor, kNotValid, false, lookup_context); - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_PREFIX_CHECKED, - before_keys); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED, before_keys); uint64_t after_keys = range->KeysLeft(); uint64_t filtered_keys = before_keys - after_keys; if (filtered_keys) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_PREFIX_USEFUL, + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_USEFUL, filtered_keys); } } @@ -2266,7 +2272,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, get_context, &lookup_context); TEST_SYNC_POINT("BlockBasedTable::Get:AfterFilterMatch"); if (!may_match) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); } else { IndexBlockIter iiter_on_stack; @@ -2303,15 +2309,16 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, // Not found // TODO: think about interaction with Merge. If a user key cannot // cross one data block, we should be fine. - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); break; } if (!v.first_internal_key.empty() && !skip_filters && UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0) { + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { // The requested key falls between highest key in previous block and // lowest key in current block. 
break; @@ -2354,8 +2361,10 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, // Call the *saver function on each entry/block until it returns false for (; biter.Valid(); biter.Next()) { ParsedInternalKey parsed_key; - if (ParseInternalKey(biter.key(), &parsed_key) != Status::OK()) { - s = Status::Corruption(Slice()); + Status pik_status = ParseInternalKey( + biter.key(), &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + s = pik_status; } if (!get_context->SaveValue( @@ -2382,7 +2391,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, referenced_key = key; } BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), + rep_->ioptions.clock->NowMicros(), /*block_key=*/"", lookup_data_block_context.block_type, lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), @@ -2408,7 +2417,7 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } } if (matched && filter != nullptr && !filter->IsBlockBased()) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, rep_->level); } @@ -2475,6 +2484,8 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, { MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(), sst_file_range.end()); + std::vector cache_handles; + bool wait_for_cache_results = false; CachableEntry uncompression_dict; Status uncompression_dict_status; @@ -2496,8 +2507,9 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, if (!iiter->Valid() || (!v.first_internal_key.empty() && !skip_filters && UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0)) { + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0)) { // The requested key falls between highest key in previous block and // lowest key in current block. if (!iiter->status().IsNotFound()) { @@ -2546,20 +2558,61 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, Status s = RetrieveBlock( nullptr, ro, handle, dict, &(results.back()), BlockType::kData, miter->get_context, &lookup_data_block_context, - /* for_compaction */ false, /* use_cache */ true); + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ false); if (s.IsIncomplete()) { s = Status::OK(); } if (s.ok() && !results.back().IsEmpty()) { - // Found it in the cache. Add NULL handle to indicate there is - // nothing to read from disk - block_handles.emplace_back(BlockHandle::NullBlockHandle()); + // Since we have a valid handle, check the value. If its nullptr, + // it means the cache is waiting for the final result and we're + // supposed to call WaitAll() to wait for the result. + if (results.back().GetValue() != nullptr) { + // Found it in the cache. Add NULL handle to indicate there is + // nothing to read from disk. 
+ if (results.back().GetCacheHandle()) { + results.back().UpdateCachedValue(); + } + block_handles.emplace_back(BlockHandle::NullBlockHandle()); + } else { + // We have to wait for the cache lookup to finish in the + // background, and then we may have to read the block from disk + // anyway + assert(results.back().GetCacheHandle()); + wait_for_cache_results = true; + block_handles.emplace_back(handle); + cache_handles.emplace_back(results.back().GetCacheHandle()); + } } else { block_handles.emplace_back(handle); total_len += block_size(handle); } } + if (wait_for_cache_results) { + Cache* block_cache = rep_->table_options.block_cache.get(); + block_cache->WaitAll(cache_handles); + for (size_t i = 0; i < block_handles.size(); ++i) { + // If this block was a success or failure or not needed because + // the corresponding key is in the same block as a prior key, skip + if (block_handles[i] == BlockHandle::NullBlockHandle() || + results[i].IsEmpty()) { + continue; + } + results[i].UpdateCachedValue(); + void* val = results[i].GetValue(); + if (!val) { + // The async cache lookup failed - could be due to an error + // or a false positive. We need to read the data block from + // the SST file + results[i].Reset(); + total_len += block_size(block_handles[i]); + } else { + block_handles[i] = BlockHandle::NullBlockHandle(); + } + } + } + if (total_len) { char* scratch = nullptr; const UncompressionDict& dict = uncompression_dict.GetValue() @@ -2635,8 +2688,9 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, IndexValue v = iiter->value(); if (!v.first_internal_key.empty() && !skip_filters && UserComparatorWrapper(rep_->internal_comparator.user_comparator()) - .Compare(ExtractUserKey(key), - ExtractUserKey(v.first_internal_key)) < 0) { + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0) { // The requested key falls between highest key in previous block and // lowest key in current block. break; @@ -2678,8 +2732,10 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, ParsedInternalKey parsed_key; Cleanable dummy; Cleanable* value_pinner = nullptr; - if (ParseInternalKey(biter->key(), &parsed_key) != Status::OK()) { - s = Status::Corruption(Slice()); + Status pik_status = ParseInternalKey( + biter->key(), &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + s = pik_status; } if (biter->IsValuePinned()) { if (reusing_block) { @@ -2716,7 +2772,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, referenced_key = key; } BlockCacheTraceRecord access_record( - rep_->ioptions.env->NowMicros(), + rep_->ioptions.clock->NowMicros(), /*block_key=*/"", lookup_data_block_context.block_type, lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), /*cf_name=*/"", rep_->level_for_tracing(), @@ -2748,7 +2804,7 @@ void BlockBasedTable::MultiGet(const ReadOptions& read_options, } while (iiter->Valid()); if (matched && filter != nullptr && !filter->IsBlockBased()) { - RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, rep_->level); } @@ -2869,11 +2925,11 @@ Status BlockBasedTable::VerifyChecksumInBlocks( // increasing of the buffer size. size_t readahead_size = (read_options.readahead_size != 0) ? 
read_options.readahead_size - : kMaxAutoReadaheadSize; + : rep_->table_options.max_auto_readahead_size; // FilePrefetchBuffer doesn't work in mmap mode and readahead is not // needed there. FilePrefetchBuffer prefetch_buffer( - rep_->file.get(), readahead_size /* readadhead_size */, + rep_->file.get(), readahead_size /* readahead_size */, readahead_size /* max_readahead_size */, !rep_->ioptions.allow_mmap_reads /* enable */); @@ -3040,7 +3096,7 @@ Status BlockBasedTable::CreateIndexReader( auto meta_index_iter = preloaded_meta_index_iter; bool should_fallback = false; if (rep_->internal_prefix_transform.get() == nullptr) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "No prefix extractor passed in. Fall back to binary" " search index."); should_fallback = true; @@ -3050,7 +3106,7 @@ Status BlockBasedTable::CreateIndexReader( if (!s.ok()) { // we simply fall back to binary search in case there is any // problem with prefix hash index loading. - ROCKS_LOG_WARN(rep_->ioptions.info_log, + ROCKS_LOG_WARN(rep_->ioptions.logger, "Unable to read the metaindex block." " Fall back to binary search index."); should_fallback = true; diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 2923b482d34..43b56a68ca3 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -64,9 +64,6 @@ class BlockBasedTable : public TableReader { // All the below fields control iterator readahead static const size_t kInitAutoReadaheadSize = 8 * 1024; - // Found that 256 KB readahead size provides the best performance, based on - // experiments, for auto readahead. Experiment data is in PR #3282. - static const size_t kMaxAutoReadaheadSize; static const int kMinNumFileReadsToStartAutoReadahead = 2; // Attempt to open the table that is stored in bytes [0..file_size) @@ -87,7 +84,7 @@ class BlockBasedTable : public TableReader { // are set. // @param force_direct_prefetch if true, always prefetching to RocksDB // buffer, rather than calling RandomAccessFile::Prefetch(). - static Status Open(const ReadOptions& ro, const ImmutableCFOptions& ioptions, + static Status Open(const ReadOptions& ro, const ImmutableOptions& ioptions, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, @@ -102,7 +99,9 @@ class BlockBasedTable : public TableReader { bool force_direct_prefetch = false, TailPrefetchStats* tail_prefetch_stats = nullptr, BlockCacheTracer* const block_cache_tracer = nullptr, - size_t max_file_size_for_l0_meta_pin = 0); + size_t max_file_size_for_l0_meta_pin = 0, + const std::string& db_session_id = "", + uint64_t cur_file_num = 0); bool PrefixMayMatch(const Slice& internal_key, const ReadOptions& read_options, @@ -218,6 +217,11 @@ class BlockBasedTable : public TableReader { size_t cache_key_prefix_size, const BlockHandle& handle, char* cache_key); + static void UpdateCacheInsertionMetrics(BlockType block_type, + GetContext* get_context, size_t usage, + bool redundant, + Statistics* const statistics); + // Retrieve all key value pairs from data blocks in the table. // The key retrieved are internal keys. 
Status GetKVPairsFromDataBlocks(std::vector* kv_pair_blocks); @@ -268,12 +272,13 @@ class BlockBasedTable : public TableReader { size_t usage) const; void UpdateCacheMissMetrics(BlockType block_type, GetContext* get_context) const; - void UpdateCacheInsertionMetrics(BlockType block_type, - GetContext* get_context, size_t usage, - bool redundant) const; + Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, - BlockType block_type, - GetContext* get_context) const; + BlockType block_type, const bool wait, + GetContext* get_context, + const Cache::CacheItemHelper* cache_helper, + const Cache::CreateCallback& create_cb, + Cache::Priority priority) const; // Either Block::NewDataIterator() or Block::NewIndexIterator(). template @@ -295,9 +300,9 @@ class BlockBasedTable : public TableReader { Status MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - CachableEntry* block_entry, BlockType block_type, - GetContext* get_context, BlockCacheLookupContext* lookup_context, - BlockContents* contents) const; + const bool wait, CachableEntry* block_entry, + BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, BlockContents* contents) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the @@ -309,7 +314,8 @@ class BlockBasedTable : public TableReader { CachableEntry* block_entry, BlockType block_type, GetContext* get_context, BlockCacheLookupContext* lookup_context, - bool for_compaction, bool use_cache) const; + bool for_compaction, bool use_cache, + bool wait_for_cache) const; void RetrieveMultipleBlocks( const ReadOptions& options, const MultiGetRange* batch, @@ -349,7 +355,7 @@ class BlockBasedTable : public TableReader { Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, CachableEntry* block, const UncompressionDict& uncompression_dict, BlockType block_type, - GetContext* get_context) const; + const bool wait, GetContext* get_context) const; // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then @@ -446,20 +452,37 @@ class BlockBasedTable : public TableReader { bool use_cache, bool prefetch, bool pin, BlockCacheLookupContext* lookup_context); - static void SetupCacheKeyPrefix(Rep* rep); + static void SetupCacheKeyPrefix(Rep* rep, const std::string& db_session_id, + uint64_t cur_file_num); // Generate a cache key prefix from the file template static void GenerateCachePrefix(TCache* cc, TFile* file, char* buffer, - size_t* size) { + size_t* size, + const std::string& db_session_id, + uint64_t cur_file_num) { // generate an id from the file *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); // If the prefix wasn't generated or was too long, - // create one from the cache. + // create one based on the DbSessionId and curent file number if they + // are set. Otherwise, created from NewId() if (cc != nullptr && *size == 0) { - char* end = EncodeVarint64(buffer, cc->NewId()); - *size = static_cast(end - buffer); + if (db_session_id.size() == 20) { + // db_session_id is 20 bytes as defined. 
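+        // Resulting prefix layout: bytes [0, 20) hold the raw db_session_id
+        // and the bytes from offset 20 hold a varint64 of cur_file_num (or
+        // of cc->NewId() when cur_file_num is 0).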
+ memcpy(buffer, db_session_id.c_str(), 20); + char* end; + if (cur_file_num != 0) { + end = EncodeVarint64(buffer + 20, cur_file_num); + } else { + end = EncodeVarint64(buffer + 20, cc->NewId()); + } + // kMaxVarint64Length is 10 therefore, the prefix is at most 30 bytes. + *size = static_cast(end - buffer); + } else { + char* end = EncodeVarint64(buffer, cc->NewId()); + *size = static_cast(end - buffer); + } } } @@ -505,7 +528,7 @@ class BlockBasedTable::PartitionedIndexIteratorState // Stores all the properties associated with a BlockBasedTable. // These are immutable. struct BlockBasedTable::Rep { - Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, + Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options, const BlockBasedTableOptions& _table_opt, const InternalKeyComparator& _internal_comparator, bool skip_filters, uint64_t _file_size, int _level, const bool _immortal_table) @@ -524,7 +547,7 @@ struct BlockBasedTable::Rep { level(_level), immortal_table(_immortal_table) {} ~Rep() { status.PermitUncheckedError(); } - const ImmutableCFOptions& ioptions; + const ImmutableOptions& ioptions; const EnvOptions& env_options; const BlockBasedTableOptions table_options; const FilterPolicy* const filter_policy; @@ -626,19 +649,23 @@ struct BlockBasedTable::Rep { uint64_t sst_number_for_tracing() const { return file ? TableFileNameToNumber(file->file_name()) : UINT64_MAX; } - void CreateFilePrefetchBuffer( - size_t readahead_size, size_t max_readahead_size, - std::unique_ptr* fpb) const { - fpb->reset(new FilePrefetchBuffer(file.get(), readahead_size, - max_readahead_size, - !ioptions.allow_mmap_reads /* enable */)); + void CreateFilePrefetchBuffer(size_t readahead_size, + size_t max_readahead_size, + std::unique_ptr* fpb, + bool implicit_auto_readahead) const { + fpb->reset(new FilePrefetchBuffer( + file.get(), readahead_size, max_readahead_size, + !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset*/, + implicit_auto_readahead)); } void CreateFilePrefetchBufferIfNotExists( size_t readahead_size, size_t max_readahead_size, - std::unique_ptr* fpb) const { + std::unique_ptr* fpb, + bool implicit_auto_readahead) const { if (!(*fpb)) { - CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb); + CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb, + implicit_auto_readahead); } } }; @@ -655,13 +682,21 @@ class WritableFileStringStreamAdapter : public std::stringbuf { explicit WritableFileStringStreamAdapter(WritableFile* writable_file) : file_(writable_file) {} - // This is to handle `std::endl`, `endl` is written by `os.put()` directly - // without going through `xsputn()`. As we explicitly disabled buffering, - // every write, not captured by xsputn, is an overflow. + // Override overflow() to handle `sputc()`. There are cases that will not go + // through `xsputn()` e.g. `std::endl` or an unsigned long long is written by + // `os.put()` directly and will call `sputc()` By internal implementation: + // int_type __CLR_OR_THIS_CALL sputc(_Elem _Ch) { // put a character + // return 0 < _Pnavail() ? _Traits::to_int_type(*_Pninc() = _Ch) : + // overflow(_Traits::to_int_type(_Ch)); + // } + // As we explicitly disabled buffering (_Pnavail() is always 0), every write, + // not captured by xsputn(), becomes an overflow here. 
int overflow(int ch = EOF) override { - if (ch == '\n') { - file_->Append("\n"); - return ch; + if (ch != EOF) { + Status s = file_->Append(Slice((char*)&ch, 1)); + if (s.ok()) { + return ch; + } } return EOF; } diff --git a/table/block_based/block_based_table_reader_impl.h b/table/block_based/block_based_table_reader_impl.h index d9cfaa92ca5..603c6243153 100644 --- a/table/block_based/block_based_table_reader_impl.h +++ b/table/block_based/block_based_table_reader_impl.h @@ -54,7 +54,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( CachableEntry block; s = RetrieveBlock(prefetch_buffer, ro, handle, dict, &block, block_type, get_context, lookup_context, for_compaction, - /* use_cache */ true); + /* use_cache */ true, /* wait_for_cache */ true); if (!s.ok()) { assert(block.IsEmpty()); diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc index be2af3195d9..07136dbf809 100644 --- a/table/block_based/block_based_table_reader_test.cc +++ b/table/block_based/block_based_table_reader_test.cc @@ -59,17 +59,17 @@ class BlockBasedTableReaderTest // Create table builder. Options options; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); InternalKeyComparator comparator(options.comparator); ColumnFamilyOptions cf_options; MutableCFOptions moptions(cf_options); - std::vector> factories; + IntTblPropCollectorFactories factories; std::unique_ptr table_builder(table_factory_->NewTableBuilder( TableBuilderOptions(ioptions, moptions, comparator, &factories, - compression_type, 0 /* sample_for_compression */, - CompressionOptions(), false /* skip_filters */, - kDefaultColumnFamilyName, -1 /* level */), - 0 /* column_family_id */, writer.get())); + compression_type, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, + -1 /* level */), + writer.get())); // Build table. 
for (auto it = kv.begin(); it != kv.end(); it++) { @@ -81,7 +81,7 @@ class BlockBasedTableReaderTest } void NewBlockBasedTableReader(const FileOptions& foptions, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const InternalKeyComparator& comparator, const std::string& table_name, std::unique_ptr* table) { @@ -135,7 +135,8 @@ class BlockBasedTableReaderTest std::string path = Path(filename); std::unique_ptr f; ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr)); - reader->reset(new RandomAccessFileReader(std::move(f), path, env_)); + reader->reset(new RandomAccessFileReader(std::move(f), path, + env_->GetSystemClock().get())); } std::string ToInternalKey(const std::string& key) { @@ -196,7 +197,7 @@ TEST_P(BlockBasedTableReaderTest, MultiGet) { std::unique_ptr table; Options options; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); FileOptions foptions; foptions.use_direct_reads = use_direct_reads_; InternalKeyComparator comparator(options.comparator); @@ -274,7 +275,7 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) { std::unique_ptr table; Options options; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); FileOptions foptions; foptions.use_direct_reads = use_direct_reads_; InternalKeyComparator comparator(options.comparator); @@ -299,7 +300,8 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) { table.reset(); // Corrupt the block pointed to by handle - test::CorruptFile(Path(table_name), static_cast(handle.offset()), 128); + ASSERT_OK(test::CorruptFile(options.env, Path(table_name), + static_cast(handle.offset()), 128)); NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); Status s = table->VerifyChecksum(ReadOptions(), diff --git a/table/block_based/block_like_traits.h b/table/block_based/block_like_traits.h new file mode 100644 index 00000000000..ccfa8bc56b3 --- /dev/null +++ b/table/block_based/block_like_traits.h @@ -0,0 +1,225 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
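Editorial aside (not part of the patch): the new block_like_traits.h that begins here gives each cached block type the three callbacks a serialization-aware block cache needs: report the object's byte size, copy a byte range of its serialized form, and rebuild the object from raw bytes. The toy type and function names below are hypothetical stand-ins that only illustrate those roles.

#include <cstddef>
#include <cstring>
#include <string>

struct ToyBlock {
  std::string data;
};

// Size of the serialized form (here just the payload length).
inline size_t ToySizeCallback(void* obj) {
  return static_cast<ToyBlock*>(obj)->data.size();
}

// Copy `length` serialized bytes starting at `from_offset` into `out`.
inline void ToySaveToCallback(void* from_obj, size_t from_offset,
                              size_t length, void* out) {
  ToyBlock* ptr = static_cast<ToyBlock*>(from_obj);
  std::memcpy(out, ptr->data.data() + from_offset, length);
}

// Rebuild an in-memory object from raw bytes handed back by the cache.
inline ToyBlock* ToyCreateCallback(const void* buf, size_t size) {
  ToyBlock* block = new ToyBlock;
  block->data.assign(static_cast<const char*>(buf), size);
  return block;
}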
+ +#pragma once + +#include "cache/cache_entry_roles.h" +#include "port/lang.h" +#include "table/block_based/block.h" +#include "table/block_based/block_type.h" +#include "table/block_based/parsed_full_filter_block.h" +#include "table/format.h" + +namespace ROCKSDB_NAMESPACE { + +template +class BlocklikeTraits; + +template +Cache::CacheItemHelper* GetCacheItemHelperForRole(); + +template +Cache::CreateCallback GetCreateCallback(size_t read_amp_bytes_per_bit, + Statistics* statistics, bool using_zstd, + const FilterPolicy* filter_policy) { + return [read_amp_bytes_per_bit, statistics, using_zstd, filter_policy]( + void* buf, size_t size, void** out_obj, size_t* charge) -> Status { + assert(buf != nullptr); + std::unique_ptr buf_data(new char[size]()); + memcpy(buf_data.get(), buf, size); + BlockContents bc = BlockContents(std::move(buf_data), size); + TBlocklike* ucd_ptr = BlocklikeTraits::Create( + std::move(bc), read_amp_bytes_per_bit, statistics, using_zstd, + filter_policy); + *out_obj = reinterpret_cast(ucd_ptr); + *charge = size; + return Status::OK(); + }; +} + +template <> +class BlocklikeTraits { + public: + static BlockContents* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new BlockContents(std::move(contents)); + } + + static uint32_t GetNumRestarts(const BlockContents& /* contents */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + BlockContents* ptr = static_cast(obj); + return ptr->data.size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + BlockContents* ptr = static_cast(from_obj); + const char* buf = ptr->data.data(); + assert(length == ptr->data.size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + if (block_type == BlockType::kFilter) { + return GetCacheItemHelperForRole< + BlockContents, CacheEntryRole::kDeprecatedFilterBlock>(); + } else { + // E.g. 
compressed cache + return GetCacheItemHelperForRole(); + } + } +}; + +template <> +class BlocklikeTraits { + public: + static ParsedFullFilterBlock* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool /* using_zstd */, + const FilterPolicy* filter_policy) { + return new ParsedFullFilterBlock(filter_policy, std::move(contents)); + } + + static uint32_t GetNumRestarts(const ParsedFullFilterBlock& /* block */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + ParsedFullFilterBlock* ptr = static_cast(obj); + return ptr->GetBlockContentsData().size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + ParsedFullFilterBlock* ptr = static_cast(from_obj); + const char* buf = ptr->GetBlockContentsData().data(); + assert(length == ptr->GetBlockContentsData().size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + (void)block_type; + assert(block_type == BlockType::kFilter); + return GetCacheItemHelperForRole(); + } +}; + +template <> +class BlocklikeTraits { + public: + static Block* Create(BlockContents&& contents, size_t read_amp_bytes_per_bit, + Statistics* statistics, bool /* using_zstd */, + const FilterPolicy* /* filter_policy */) { + return new Block(std::move(contents), read_amp_bytes_per_bit, statistics); + } + + static uint32_t GetNumRestarts(const Block& block) { + return block.NumRestarts(); + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + Block* ptr = static_cast(obj); + return ptr->size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + Block* ptr = static_cast(from_obj); + const char* buf = ptr->data(); + assert(length == ptr->size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + switch (block_type) { + case BlockType::kData: + return GetCacheItemHelperForRole(); + case BlockType::kIndex: + return GetCacheItemHelperForRole(); + case BlockType::kFilter: + return GetCacheItemHelperForRole(); + default: + // Not a recognized combination + assert(false); + FALLTHROUGH_INTENDED; + case BlockType::kRangeDeletion: + return GetCacheItemHelperForRole(); + } + } +}; + +template <> +class BlocklikeTraits { + public: + static UncompressionDict* Create(BlockContents&& contents, + size_t /* read_amp_bytes_per_bit */, + Statistics* /* statistics */, + bool using_zstd, + const FilterPolicy* /* filter_policy */) { + return new UncompressionDict(contents.data, std::move(contents.allocation), + using_zstd); + } + + static uint32_t GetNumRestarts(const UncompressionDict& /* dict */) { + return 0; + } + + static size_t SizeCallback(void* obj) { + assert(obj != nullptr); + UncompressionDict* ptr = static_cast(obj); + return ptr->slice_.size(); + } + + static Status SaveToCallback(void* from_obj, size_t from_offset, + size_t length, void* out) { + assert(from_obj != nullptr); + UncompressionDict* ptr = static_cast(from_obj); + const char* buf = ptr->slice_.data(); + assert(length == ptr->slice_.size()); + (void)from_offset; + memcpy(out, buf, length); + return Status::OK(); + } + + static Cache::CacheItemHelper* GetCacheItemHelper(BlockType block_type) { + (void)block_type; + assert(block_type == 
BlockType::kCompressionDictionary); + return GetCacheItemHelperForRole(); + } +}; + +// Get an CacheItemHelper pointer for value type T and role R. +template +Cache::CacheItemHelper* GetCacheItemHelperForRole() { + static Cache::CacheItemHelper cache_helper( + BlocklikeTraits::SizeCallback, BlocklikeTraits::SaveToCallback, + GetCacheEntryDeleterForRole()); + return &cache_helper; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc index aa3fc36108d..d9ef162c63f 100644 --- a/table/block_based/block_prefetcher.cc +++ b/table/block_based/block_prefetcher.cc @@ -16,51 +16,83 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep, if (is_for_compaction) { rep->CreateFilePrefetchBufferIfNotExists(compaction_readahead_size_, compaction_readahead_size_, - &prefetch_buffer_); + &prefetch_buffer_, false); return; } - // Explicit user requested readahead + // Explicit user requested readahead. if (readahead_size > 0) { rep->CreateFilePrefetchBufferIfNotExists(readahead_size, readahead_size, - &prefetch_buffer_); + &prefetch_buffer_, false); return; } + // Implicit readahead. + + // If max_auto_readahead_size is set to be 0 by user, no data will be + // prefetched. + size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size; + if (max_auto_readahead_size == 0) { + return; + } + + size_t len = static_cast(block_size(handle)); + size_t offset = handle.offset(); + + // If FS supports prefetching (readahead_limit_ will be non zero in that case) + // and current block exists in prefetch buffer then return. + if (offset + len <= readahead_limit_) { + UpdateReadPattern(offset, len); + return; + } + + if (!IsBlockSequential(offset)) { + UpdateReadPattern(offset, len); + ResetValues(); + return; + } + UpdateReadPattern(offset, len); + // Implicit auto readahead, which will be enabled if the number of reads - // reached `kMinNumFileReadsToStartAutoReadahead` (default: 2). + // reached `kMinNumFileReadsToStartAutoReadahead` (default: 2) and scans are + // sequential. num_file_reads_++; if (num_file_reads_ <= BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) { return; } + size_t initial_auto_readahead_size = BlockBasedTable::kInitAutoReadaheadSize; + if (initial_auto_readahead_size > max_auto_readahead_size) { + initial_auto_readahead_size = max_auto_readahead_size; + } + if (rep->file->use_direct_io()) { - rep->CreateFilePrefetchBufferIfNotExists( - BlockBasedTable::kInitAutoReadaheadSize, - BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_); + rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size, + max_auto_readahead_size, + &prefetch_buffer_, true); return; } - if (handle.offset() + static_cast(block_size(handle)) <= - readahead_limit_) { - return; + if (readahead_size_ > max_auto_readahead_size) { + readahead_size_ = max_auto_readahead_size; } // If prefetch is not supported, fall back to use internal prefetch buffer. // Discarding other return status of Prefetch calls intentionally, as // we can fallback to reading from disk if Prefetch fails. 
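Editorial aside (not part of the patch): a compact illustration of the auto-readahead sizing that PrefetchIfNeeded() now implements. Implicit readahead stays off for the first kMinNumFileReadsToStartAutoReadahead (2) sequential reads, then starts at kInitAutoReadaheadSize (8 KB) and doubles on each subsequent read, clamped to BlockBasedTableOptions::max_auto_readahead_size; 256 KB is assumed here as the cap, matching the former kMaxAutoReadaheadSize.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const size_t kInit = 8 * 1024;   // kInitAutoReadaheadSize
  const size_t kMax = 256 * 1024;  // assumed max_auto_readahead_size
  size_t readahead = kInit;
  for (int read = 1; read <= 8; ++read) {
    if (read <= 2) {
      // Below kMinNumFileReadsToStartAutoReadahead: no prefetching yet.
      std::printf("read %d: no readahead\n", read);
      continue;
    }
    std::printf("read %d: readahead %zu bytes\n", read, readahead);
    // Keep exponentially increasing the readahead size until the cap.
    readahead = std::min(kMax, readahead * 2);
  }
  return 0;
}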
- Status s = rep->file->Prefetch(handle.offset(), readahead_size_); + Status s = rep->file->Prefetch(handle.offset(), + block_size(handle) + readahead_size_); if (s.IsNotSupported()) { - rep->CreateFilePrefetchBufferIfNotExists( - BlockBasedTable::kInitAutoReadaheadSize, - BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_); + rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size, + max_auto_readahead_size, + &prefetch_buffer_, true); return; } - readahead_limit_ = static_cast(handle.offset() + readahead_size_); + + readahead_limit_ = offset + len + readahead_size_; // Keep exponentially increasing readahead size until - // kMaxAutoReadaheadSize. - readahead_size_ = - std::min(BlockBasedTable::kMaxAutoReadaheadSize, readahead_size_ * 2); + // max_auto_readahead_size. + readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2); } } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_prefetcher.h b/table/block_based/block_prefetcher.h index ee3b61f5c09..30b3d5eb2c1 100644 --- a/table/block_based/block_prefetcher.h +++ b/table/block_based/block_prefetcher.h @@ -19,6 +19,22 @@ class BlockPrefetcher { bool is_for_compaction); FilePrefetchBuffer* prefetch_buffer() { return prefetch_buffer_.get(); } + void UpdateReadPattern(const size_t& offset, const size_t& len) { + prev_offset_ = offset; + prev_len_ = len; + } + + bool IsBlockSequential(const size_t& offset) { + return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset)); + } + + void ResetValues() { + num_file_reads_ = 1; + readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; + readahead_limit_ = 0; + return; + } + private: // Readahead size used in compaction, its value is used only if // lookup_context_.caller = kCompaction. @@ -27,6 +43,8 @@ class BlockPrefetcher { size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; size_t readahead_limit_ = 0; int64_t num_file_reads_ = 0; + size_t prev_offset_ = 0; + size_t prev_len_ = 0; std::unique_ptr prefetch_buffer_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/cachable_entry.h b/table/block_based/cachable_entry.h index 598f1ef5771..155097c0536 100644 --- a/table/block_based/cachable_entry.h +++ b/table/block_based/cachable_entry.h @@ -162,7 +162,6 @@ class CachableEntry { } void SetCachedValue(T* value, Cache* cache, Cache::Handle* cache_handle) { - assert(value != nullptr); assert(cache != nullptr); assert(cache_handle != nullptr); @@ -179,6 +178,22 @@ class CachableEntry { assert(!own_value_); } + void UpdateCachedValue() { + assert(cache_ != nullptr); + assert(cache_handle_ != nullptr); + + value_ = static_cast(cache_->Value(cache_handle_)); + } + + bool IsReady() { + if (!own_value_) { + assert(cache_ != nullptr); + assert(cache_handle_ != nullptr); + return cache_->IsReady(cache_handle_); + } + return true; + } + private: void ReleaseResource() { if (LIKELY(cache_handle_ != nullptr)) { diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index 94fa7e94f07..121f78cef4a 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -539,26 +539,27 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, int level_ = -1; std::vector keys; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); EnvOptions soptions; 
soptions.use_mmap_reads = ioptions.allow_mmap_reads; + test::StringSink* sink = new test::StringSink(); + std::unique_ptr f(sink); file_writer.reset( - test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + new WritableFileWriter(std::move(f), "" /* don't care */, FileOptions())); std::unique_ptr builder; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; builder.reset(ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, moptions, internal_comparator, - &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - CompressionOptions(), false /* skip_filters */, - column_family_name, level_), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions( + ioptions, moptions, internal_comparator, + &int_tbl_prop_collector_factories, options.compression, + CompressionOptions(), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + column_family_name, level_), file_writer.get())); builder->Add(ik1.Encode().ToString(), v1); @@ -569,23 +570,20 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, file_writer->Flush(); EXPECT_TRUE(s.ok()) << s.ToString(); - EXPECT_EQ( - test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(), - builder->FileSize()); + EXPECT_EQ(sink->contents().size(), builder->FileSize()); // Open the table - file_reader.reset(test::GetRandomAccessFileReader(new test::StringSource( - test::GetStringSinkFromLegacyWriter(file_writer.get())->contents(), - 0 /*uniq_id*/, ioptions.allow_mmap_reads))); + test::StringSource* source = new test::StringSource( + sink->contents(), 0 /*uniq_id*/, ioptions.allow_mmap_reads); + std::unique_ptr file(source); + file_reader.reset(new RandomAccessFileReader(std::move(file), "test")); const bool kSkipFilters = true; const bool kImmortal = true; - ioptions.table_factory->NewTableReader( + ASSERT_OK(ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, level_), - std::move(file_reader), - test::GetStringSinkFromLegacyWriter(file_writer.get())->contents().size(), - &table_reader); + std::move(file_reader), sink->contents().size(), &table_reader)); // Search using Get() ReadOptions ro; diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index d94c7e606db..6f509c4f712 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -60,8 +60,11 @@ class FilterBlockBuilder { virtual bool IsBlockBased() = 0; // If is blockbased filter virtual void StartBlock(uint64_t block_offset) = 0; // Start new block filter - virtual void Add(const Slice& key) = 0; // Add a key to current filter - virtual size_t NumAdded() const = 0; // Number of keys added + virtual void Add( + const Slice& key_without_ts) = 0; // Add a key to current filter + virtual bool IsEmpty() const = 0; // Empty == none added + // For reporting stats on how many entries the builder considered unique + virtual size_t EstimateEntriesAdded() = 0; Slice Finish() { // Generate Filter const BlockHandle empty_handle; Status dont_care_status; @@ -108,11 +111,11 @@ class FilterBlockReader { uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { - const Slice ukey = iter->ukey; + const Slice ukey_without_ts = 
iter->ukey_without_ts; const Slice ikey = iter->ikey; GetContext* const get_context = iter->get_context; - if (!KeyMayMatch(ukey, prefix_extractor, block_offset, no_io, &ikey, - get_context, lookup_context)) { + if (!KeyMayMatch(ukey_without_ts, prefix_extractor, block_offset, no_io, + &ikey, get_context, lookup_context)) { range->SkipKey(iter); } } @@ -133,13 +136,13 @@ class FilterBlockReader { uint64_t block_offset, const bool no_io, BlockCacheLookupContext* lookup_context) { for (auto iter = range->begin(); iter != range->end(); ++iter) { - const Slice ukey = iter->ukey; + const Slice ukey_without_ts = iter->ukey_without_ts; const Slice ikey = iter->ikey; GetContext* const get_context = iter->get_context; - if (prefix_extractor->InDomain(ukey) && - !PrefixMayMatch(prefix_extractor->Transform(ukey), prefix_extractor, - block_offset, no_io, &ikey, get_context, - lookup_context)) { + if (prefix_extractor->InDomain(ukey_without_ts) && + !PrefixMayMatch(prefix_extractor->Transform(ukey_without_ts), + prefix_extractor, block_offset, no_io, &ikey, + get_context, lookup_context)) { range->SkipKey(iter); } } @@ -153,10 +156,12 @@ class FilterBlockReader { return error_msg; } - virtual void CacheDependencies(const ReadOptions& /*ro*/, bool /*pin*/) {} + virtual Status CacheDependencies(const ReadOptions& /*ro*/, bool /*pin*/) { + return Status::OK(); + } virtual bool RangeMayExist(const Slice* /*iterate_upper_bound*/, - const Slice& user_key, + const Slice& user_key_without_ts, const SliceTransform* prefix_extractor, const Comparator* /*comparator*/, const Slice* const const_ikey_ptr, @@ -167,7 +172,7 @@ class FilterBlockReader { return true; } *filter_checked = true; - Slice prefix = prefix_extractor->Transform(user_key); + Slice prefix = prefix_extractor->Transform(user_key_without_ts); return PrefixMayMatch(prefix, prefix_extractor, kNotValid, no_io, const_ikey_ptr, /* get_context */ nullptr, lookup_context); diff --git a/table/block_based/filter_block_reader_common.cc b/table/block_based/filter_block_reader_common.cc index fa0802669c6..135fffdf2df 100644 --- a/table/block_based/filter_block_reader_common.cc +++ b/table/block_based/filter_block_reader_common.cc @@ -30,7 +30,8 @@ Status FilterBlockReaderCommon::ReadFilterBlock( table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, UncompressionDict::GetEmptyDict(), filter_block, BlockType::kFilter, get_context, lookup_context, - /* for_compaction */ false, use_cache); + /* for_compaction */ false, use_cache, + /* wait_for_cache */ true); return s; } diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index 31eb6b90df5..994ada1ba7e 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -7,119 +7,82 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
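Editorial aside (not part of the patch): the ukey_without_ts / key_without_ts plumbing in the filter interfaces above exists because, with user-defined timestamps enabled, a full user key is the key bytes followed by a fixed-size timestamp, while filters are built from the key bytes alone; every probe therefore strips the timestamp first. A minimal sketch of that slicing, with a hypothetical helper name:

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <string>

// Hypothetical helper: drop the trailing ts_sz timestamp bytes of a user key.
std::string StripTimestampForIllustration(const std::string& user_key,
                                          size_t ts_sz) {
  assert(user_key.size() >= ts_sz);
  return user_key.substr(0, user_key.size() - ts_sz);
}

int main() {
  const size_t kTsSize = 8;  // e.g. a 64-bit timestamp suffix
  std::string user_key = "foo" + std::string(kTsSize, 'x');
  // The filter was built from "foo", so the lookup must probe with "foo".
  std::printf("probe key: %s\n",
              StripTimestampForIllustration(user_key, kTsSize).c_str());
  return 0;
}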
+#include "rocksdb/filter_policy.h" + #include #include - -#include "rocksdb/filter_policy.h" +#include #include "rocksdb/slice.h" #include "table/block_based/block_based_filter_block.h" -#include "table/block_based/full_filter_block.h" #include "table/block_based/filter_policy_internal.h" +#include "table/block_based/full_filter_block.h" #include "third-party/folly/folly/ConstexprMath.h" #include "util/bloom_impl.h" #include "util/coding.h" #include "util/hash.h" +#include "util/ribbon_config.h" +#include "util/ribbon_impl.h" namespace ROCKSDB_NAMESPACE { namespace { -// See description in FastLocalBloomImpl -class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { +// Metadata trailer size for built-in filters. (This is separate from +// block-based table block trailer.) +// +// Originally this was 1 byte for num_probes and 4 bytes for number of +// cache lines in the Bloom filter, but now the first trailer byte is +// usually an implementation marker and remaining 4 bytes have various +// meanings. +static constexpr uint32_t kMetadataLen = 5; + +Slice FinishAlwaysFalse(std::unique_ptr* /*buf*/) { + // Missing metadata, treated as zero entries + return Slice(nullptr, 0); +} + +// Base class for filter builders using the XXH3 preview hash, +// also known as Hash64 or GetSliceHash64. +class XXH3pFilterBitsBuilder : public BuiltinFilterBitsBuilder { public: - // Non-null aggregate_rounding_balance implies optimize_filters_for_memory - explicit FastLocalBloomBitsBuilder( - const int millibits_per_key, + explicit XXH3pFilterBitsBuilder( std::atomic* aggregate_rounding_balance) - : millibits_per_key_(millibits_per_key), - aggregate_rounding_balance_(aggregate_rounding_balance) { - assert(millibits_per_key >= 1000); - } - - // No Copy allowed - FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete; - void operator=(const FastLocalBloomBitsBuilder&) = delete; + : aggregate_rounding_balance_(aggregate_rounding_balance) {} - ~FastLocalBloomBitsBuilder() override {} + ~XXH3pFilterBitsBuilder() override {} virtual void AddKey(const Slice& key) override { uint64_t hash = GetSliceHash64(key); + // Especially with prefixes, it is common to have repetition, + // though only adjacent repetition, which we want to immediately + // recognize and collapse for estimating true filter space + // requirements. 
if (hash_entries_.empty() || hash != hash_entries_.back()) { hash_entries_.push_back(hash); } } - virtual Slice Finish(std::unique_ptr* buf) override { - size_t num_entry = hash_entries_.size(); - std::unique_ptr mutable_buf; - uint32_t len_with_metadata = - CalculateAndAllocate(num_entry, &mutable_buf, /*update_balance*/ true); - - assert(mutable_buf); - assert(len_with_metadata >= 5); - - // Compute num_probes after any rounding / adjustments - int num_probes = GetNumProbes(num_entry, len_with_metadata); - - uint32_t len = len_with_metadata - 5; - if (len > 0) { - AddAllEntries(mutable_buf.get(), len, num_probes); - } - - assert(hash_entries_.empty()); - - // See BloomFilterPolicy::GetBloomBitsReader re: metadata - // -1 = Marker for newer Bloom implementations - mutable_buf[len] = static_cast(-1); - // 0 = Marker for this sub-implementation - mutable_buf[len + 1] = static_cast(0); - // num_probes (and 0 in upper bits for 64-byte block size) - mutable_buf[len + 2] = static_cast(num_probes); - // rest of metadata stays zero - - Slice rv(mutable_buf.get(), len_with_metadata); - *buf = std::move(mutable_buf); - return rv; + virtual size_t EstimateEntriesAdded() override { + return hash_entries_.size(); } - int CalculateNumEntry(const uint32_t bytes) override { - uint32_t bytes_no_meta = bytes >= 5u ? bytes - 5u : 0; - return static_cast(uint64_t{8000} * bytes_no_meta / - millibits_per_key_); - } + protected: + static constexpr uint32_t kMetadataLen = 5; - uint32_t CalculateSpace(const int num_entry) override { - // NB: the BuiltinFilterBitsBuilder API presumes len fits in uint32_t. - return static_cast( - CalculateAndAllocate(static_cast(num_entry), - /* buf */ nullptr, - /*update_balance*/ false)); + // For delegating between XXH3pFilterBitsBuilders + void SwapEntriesWith(XXH3pFilterBitsBuilder* other) { + std::swap(hash_entries_, other->hash_entries_); } - // To choose size using malloc_usable_size, we have to actually allocate. - uint32_t CalculateAndAllocate(size_t num_entry, std::unique_ptr* buf, - bool update_balance) { - std::unique_ptr tmpbuf; - - // If not for cache line blocks in the filter, what would the target - // length in bytes be? - size_t raw_target_len = static_cast( - (uint64_t{num_entry} * millibits_per_key_ + 7999) / 8000); - - if (raw_target_len >= size_t{0xffffffc0}) { - // Max supported for this data structure implementation - raw_target_len = size_t{0xffffffc0}; - } - - // Round up to nearest multiple of 64 (block size). This adjustment is - // used for target FP rate only so that we don't receive complaints about - // lower FP rate vs. historic Bloom filter behavior. - uint32_t target_len = - static_cast(raw_target_len + 63) & ~uint32_t{63}; + virtual size_t RoundDownUsableSpace(size_t available_size) = 0; + // To choose size using malloc_usable_size, we have to actually allocate. + size_t AllocateMaybeRounding(size_t target_len_with_metadata, + size_t num_entries, + std::unique_ptr* buf) { // Return value set to a default; overwritten in some cases - uint32_t rv = target_len + /* metadata */ 5; + size_t rv = target_len_with_metadata; #ifdef ROCKSDB_MALLOC_USABLE_SIZE if (aggregate_rounding_balance_ != nullptr) { // Do optimize_filters_for_memory, using malloc_usable_size. @@ -131,7 +94,7 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { // allocation size. 
// Although it can be considered bad practice to use malloc_usable_size - // to access an object beyond its original size, this approach should + // to access an object beyond its original size, this approach should be // quite general: working for all allocators that properly support // malloc_usable_size. @@ -140,7 +103,8 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { // and relative. int64_t balance = aggregate_rounding_balance_->load(); - double target_fp_rate = EstimatedFpRate(num_entry, target_len + 5); + double target_fp_rate = + EstimatedFpRate(num_entries, target_len_with_metadata); double rv_fp_rate = target_fp_rate; if (balance < 0) { @@ -151,14 +115,17 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { // To simplify, we just try a few modified smaller sizes. This also // caps how much we vary filter size vs. target, to avoid outlier // behavior from excessive variance. - for (uint64_t maybe_len64 : + size_t target_len = target_len_with_metadata - kMetadataLen; + assert(target_len < target_len_with_metadata); // check underflow + for (uint64_t maybe_len_rough : {uint64_t{3} * target_len / 4, uint64_t{13} * target_len / 16, uint64_t{7} * target_len / 8, uint64_t{15} * target_len / 16}) { - uint32_t maybe_len = - static_cast(maybe_len64) & ~uint32_t{63}; - double maybe_fp_rate = EstimatedFpRate(num_entry, maybe_len + 5); + size_t maybe_len_with_metadata = + RoundDownUsableSpace(maybe_len_rough + kMetadataLen); + double maybe_fp_rate = + EstimatedFpRate(num_entries, maybe_len_with_metadata); if (maybe_fp_rate <= for_balance_fp_rate) { - rv = maybe_len + /* metadata */ 5; + rv = maybe_len_with_metadata; rv_fp_rate = maybe_fp_rate; break; } @@ -168,12 +135,12 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { // Filter blocks are loaded into block cache with their block trailer. // We need to make sure that's accounted for in choosing a // fragmentation-friendly size. - const uint32_t kExtraPadding = kBlockTrailerSize; + const size_t kExtraPadding = kBlockTrailerSize; size_t requested = rv + kExtraPadding; // Allocate and get usable size - tmpbuf.reset(new char[requested]); - size_t usable = malloc_usable_size(tmpbuf.get()); + buf->reset(new char[requested]); + size_t usable = malloc_usable_size(buf->get()); if (usable - usable / 4 > requested) { // Ratio greater than 4/3 is too much for utilizing, if it's @@ -183,53 +150,148 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { // storage on disk. // Nothing to do, except assert that the result is accurate about // the usable size. (Assignment never used.) 
- assert((tmpbuf[usable - 1] = 'x')); + assert(((*buf)[usable - 1] = 'x')); } else if (usable > requested) { - // Adjust for reasonably larger usable size - size_t usable_len = (usable - kExtraPadding - /* metadata */ 5); - if (usable_len >= size_t{0xffffffc0}) { - // Max supported for this data structure implementation - usable_len = size_t{0xffffffc0}; - } - - rv = (static_cast(usable_len) & ~uint32_t{63}) + - /* metadata */ 5; - rv_fp_rate = EstimatedFpRate(num_entry, rv); + rv = RoundDownUsableSpace(usable - kExtraPadding); + assert(rv <= usable - kExtraPadding); + rv_fp_rate = EstimatedFpRate(num_entries, rv); } else { // Too small means bad malloc_usable_size assert(usable == requested); } - memset(tmpbuf.get(), 0, rv); + memset(buf->get(), 0, rv); - if (update_balance) { - int64_t diff = static_cast((rv_fp_rate - target_fp_rate) * - double{0x100000000}); - *aggregate_rounding_balance_ += diff; - } + // Update balance + int64_t diff = static_cast((rv_fp_rate - target_fp_rate) * + double{0x100000000}); + *aggregate_rounding_balance_ += diff; + } else { + buf->reset(new char[rv]()); } #else - (void)update_balance; + (void)num_entries; + buf->reset(new char[rv]()); #endif // ROCKSDB_MALLOC_USABLE_SIZE - if (buf) { - if (tmpbuf) { - *buf = std::move(tmpbuf); - } else { - buf->reset(new char[rv]()); - } + return rv; + } + + // A deque avoids unnecessary copying of already-saved values + // and has near-minimal peak memory use. + std::deque hash_entries_; + + // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, + // always "round up" like historic behavior. + std::atomic* aggregate_rounding_balance_; +}; + +// #################### FastLocalBloom implementation ################## // +// ############## also known as format_version=5 Bloom filter ########## // + +// See description in FastLocalBloomImpl +class FastLocalBloomBitsBuilder : public XXH3pFilterBitsBuilder { + public: + // Non-null aggregate_rounding_balance implies optimize_filters_for_memory + explicit FastLocalBloomBitsBuilder( + const int millibits_per_key, + std::atomic* aggregate_rounding_balance) + : XXH3pFilterBitsBuilder(aggregate_rounding_balance), + millibits_per_key_(millibits_per_key) { + assert(millibits_per_key >= 1000); + } + + // No Copy allowed + FastLocalBloomBitsBuilder(const FastLocalBloomBitsBuilder&) = delete; + void operator=(const FastLocalBloomBitsBuilder&) = delete; + + ~FastLocalBloomBitsBuilder() override {} + + virtual Slice Finish(std::unique_ptr* buf) override { + size_t num_entries = hash_entries_.size(); + size_t len_with_metadata = CalculateSpace(num_entries); + + std::unique_ptr mutable_buf; + len_with_metadata = + AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + + assert(mutable_buf); + assert(len_with_metadata >= kMetadataLen); + + // Max size supported by implementation + assert(len_with_metadata <= 0xffffffffU); + + // Compute num_probes after any rounding / adjustments + int num_probes = GetNumProbes(num_entries, len_with_metadata); + + uint32_t len = static_cast(len_with_metadata - kMetadataLen); + if (len > 0) { + AddAllEntries(mutable_buf.get(), len, num_probes); } + + assert(hash_entries_.empty()); + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -1 = Marker for newer Bloom implementations + mutable_buf[len] = static_cast(-1); + // 0 = Marker for this sub-implementation + mutable_buf[len + 1] = static_cast(0); + // num_probes (and 0 in upper bits for 64-byte block size) + mutable_buf[len + 2] = static_cast(num_probes); + // rest of 
metadata stays zero + + Slice rv(mutable_buf.get(), len_with_metadata); + *buf = std::move(mutable_buf); return rv; } + size_t ApproximateNumEntries(size_t bytes) override { + size_t bytes_no_meta = + bytes >= kMetadataLen ? RoundDownUsableSpace(bytes) - kMetadataLen : 0; + return static_cast(uint64_t{8000} * bytes_no_meta / + millibits_per_key_); + } + + size_t CalculateSpace(size_t num_entries) override { + // If not for cache line blocks in the filter, what would the target + // length in bytes be? + size_t raw_target_len = static_cast( + (uint64_t{num_entries} * millibits_per_key_ + 7999) / 8000); + + if (raw_target_len >= size_t{0xffffffc0}) { + // Max supported for this data structure implementation + raw_target_len = size_t{0xffffffc0}; + } + + // Round up to nearest multiple of 64 (block size). This adjustment is + // used for target FP rate only so that we don't receive complaints about + // lower FP rate vs. historic Bloom filter behavior. + return ((raw_target_len + 63) & ~size_t{63}) + kMetadataLen; + } + double EstimatedFpRate(size_t keys, size_t len_with_metadata) override { int num_probes = GetNumProbes(keys, len_with_metadata); return FastLocalBloomImpl::EstimatedFpRate( - keys, len_with_metadata - /*metadata*/ 5, num_probes, /*hash bits*/ 64); + keys, len_with_metadata - kMetadataLen, num_probes, /*hash bits*/ 64); + } + + protected: + size_t RoundDownUsableSpace(size_t available_size) override { + size_t rv = available_size - kMetadataLen; + + if (rv >= size_t{0xffffffc0}) { + // Max supported for this data structure implementation + rv = size_t{0xffffffc0}; + } + + // round down to multiple of 64 (block size) + rv &= ~size_t{63}; + + return rv + kMetadataLen; } private: // Compute num_probes after any rounding / adjustments int GetNumProbes(size_t keys, size_t len_with_metadata) { - uint64_t millibits = uint64_t{len_with_metadata - 5} * 8000; + uint64_t millibits = uint64_t{len_with_metadata - kMetadataLen} * 8000; int actual_millibits_per_key = static_cast(millibits / std::max(keys, size_t{1})); // BEGIN XXX/TODO(peterd): preserving old/default behavior for now to @@ -291,12 +353,6 @@ class FastLocalBloomBitsBuilder : public BuiltinFilterBitsBuilder { // Target allocation per added key, in thousandths of a bit. int millibits_per_key_; - // See BloomFilterPolicy::aggregate_rounding_balance_. If nullptr, - // always "round up" like historic behavior. - std::atomic* aggregate_rounding_balance_; - // A deque avoids unnecessary copying of already-saved values - // and has near-minimal peak memory use. - std::deque hash_entries_; }; // See description in FastLocalBloomImpl @@ -341,6 +397,362 @@ class FastLocalBloomBitsReader : public FilterBitsReader { const uint32_t len_bytes_; }; +// ##################### Ribbon filter implementation ################### // + +// Implements concept RehasherTypesAndSettings in ribbon_impl.h +struct Standard128RibbonRehasherTypesAndSettings { + // These are schema-critical. Any change almost certainly changes + // underlying data. + static constexpr bool kIsFilter = true; + static constexpr bool kHomogeneous = false; + static constexpr bool kFirstCoeffAlwaysOne = true; + static constexpr bool kUseSmash = false; + using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128; + using Hash = uint64_t; + using Seed = uint32_t; + // Changing these doesn't necessarily change underlying data, + // but might affect supported scalability of those dimensions. 
+ using Index = uint32_t; + using ResultRow = uint32_t; + // Save a conditional in Ribbon queries + static constexpr bool kAllowZeroStarts = false; +}; + +using Standard128RibbonTypesAndSettings = + ribbon::StandardRehasherAdapter; + +class Standard128RibbonBitsBuilder : public XXH3pFilterBitsBuilder { + public: + explicit Standard128RibbonBitsBuilder( + double desired_one_in_fp_rate, int bloom_millibits_per_key, + std::atomic* aggregate_rounding_balance, Logger* info_log) + : XXH3pFilterBitsBuilder(aggregate_rounding_balance), + desired_one_in_fp_rate_(desired_one_in_fp_rate), + info_log_(info_log), + bloom_fallback_(bloom_millibits_per_key, aggregate_rounding_balance) { + assert(desired_one_in_fp_rate >= 1.0); + } + + // No Copy allowed + Standard128RibbonBitsBuilder(const Standard128RibbonBitsBuilder&) = delete; + void operator=(const Standard128RibbonBitsBuilder&) = delete; + + ~Standard128RibbonBitsBuilder() override {} + + virtual Slice Finish(std::unique_ptr* buf) override { + if (hash_entries_.size() > kMaxRibbonEntries) { + ROCKS_LOG_WARN(info_log_, "Too many keys for Ribbon filter: %llu", + static_cast(hash_entries_.size())); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + return bloom_fallback_.Finish(buf); + } + if (hash_entries_.size() == 0) { + // Save a conditional in Ribbon queries by using alternate reader + // for zero entries added. + return FinishAlwaysFalse(buf); + } + uint32_t num_entries = static_cast(hash_entries_.size()); + uint32_t num_slots; + size_t len_with_metadata; + + CalculateSpaceAndSlots(num_entries, &len_with_metadata, &num_slots); + + // Bloom fall-back indicator + if (num_slots == 0) { + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + return bloom_fallback_.Finish(buf); + } + + uint32_t entropy = 0; + if (!hash_entries_.empty()) { + entropy = Lower32of64(hash_entries_.front()); + } + + BandingType banding; + bool success = banding.ResetAndFindSeedToSolve( + num_slots, hash_entries_.begin(), hash_entries_.end(), + /*starting seed*/ entropy & 255, /*seed mask*/ 255); + if (!success) { + ROCKS_LOG_WARN(info_log_, + "Too many re-seeds (256) for Ribbon filter, %llu / %llu", + static_cast(hash_entries_.size()), + static_cast(num_slots)); + SwapEntriesWith(&bloom_fallback_); + assert(hash_entries_.empty()); + return bloom_fallback_.Finish(buf); + } + hash_entries_.clear(); + + uint32_t seed = banding.GetOrdinalSeed(); + assert(seed < 256); + + std::unique_ptr mutable_buf; + len_with_metadata = + AllocateMaybeRounding(len_with_metadata, num_entries, &mutable_buf); + + SolnType soln(mutable_buf.get(), len_with_metadata); + soln.BackSubstFrom(banding); + uint32_t num_blocks = soln.GetNumBlocks(); + // This should be guaranteed: + // num_entries < 2^30 + // => (overhead_factor < 2.0) + // num_entries * overhead_factor == num_slots < 2^31 + // => (num_blocks = num_slots / 128) + // num_blocks < 2^24 + assert(num_blocks < 0x1000000U); + + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -2 = Marker for Standard128 Ribbon + mutable_buf[len_with_metadata - 5] = static_cast(-2); + // Hash seed + mutable_buf[len_with_metadata - 4] = static_cast(seed); + // Number of blocks, in 24 bits + // (Along with bytes, we can derive other settings) + mutable_buf[len_with_metadata - 3] = static_cast(num_blocks & 255); + mutable_buf[len_with_metadata - 2] = + static_cast((num_blocks >> 8) & 255); + mutable_buf[len_with_metadata - 1] = + static_cast((num_blocks >> 16) & 255); + + Slice rv(mutable_buf.get(), 
len_with_metadata); + *buf = std::move(mutable_buf); + return rv; + } + + // Setting num_slots to 0 means "fall back on Bloom filter." + // And note this implementation does not support num_entries or num_slots + // beyond uint32_t; see kMaxRibbonEntries. + void CalculateSpaceAndSlots(size_t num_entries, + size_t* target_len_with_metadata, + uint32_t* num_slots) { + if (num_entries > kMaxRibbonEntries) { + // More entries than supported by this Ribbon + *num_slots = 0; // use Bloom + *target_len_with_metadata = bloom_fallback_.CalculateSpace(num_entries); + return; + } + uint32_t entropy = 0; + if (!hash_entries_.empty()) { + entropy = Upper32of64(hash_entries_.front()); + } + + *num_slots = NumEntriesToNumSlots(static_cast(num_entries)); + *target_len_with_metadata = + SolnType::GetBytesForOneInFpRate(*num_slots, desired_one_in_fp_rate_, + /*rounding*/ entropy) + + kMetadataLen; + + // Consider possible Bloom fallback for small filters + if (*num_slots < 1024) { + size_t bloom = bloom_fallback_.CalculateSpace(num_entries); + if (bloom < *target_len_with_metadata) { + *num_slots = 0; // use Bloom + *target_len_with_metadata = bloom; + return; + } + } + } + + size_t CalculateSpace(size_t num_entries) override { + if (num_entries == 0) { + // See FinishAlwaysFalse + return 0; + } + size_t target_len_with_metadata; + uint32_t num_slots; + CalculateSpaceAndSlots(num_entries, &target_len_with_metadata, &num_slots); + (void)num_slots; + return target_len_with_metadata; + } + + // This is a somewhat ugly but reasonably fast and reasonably accurate + // reversal of CalculateSpace. + size_t ApproximateNumEntries(size_t bytes) override { + size_t len_no_metadata = + RoundDownUsableSpace(std::max(bytes, size_t{kMetadataLen})) - + kMetadataLen; + + if (!(desired_one_in_fp_rate_ > 1.0)) { + // Effectively asking for 100% FP rate, or NaN etc. + // Note that NaN is neither < 1.0 nor > 1.0 + return kMaxRibbonEntries; + } + + // Find a slight under-estimate for actual average bits per slot + double min_real_bits_per_slot; + if (desired_one_in_fp_rate_ >= 1.0 + std::numeric_limits::max()) { + // Max of 32 solution columns (result bits) + min_real_bits_per_slot = 32.0; + } else { + // Account for mix of b and b+1 solution columns being slightly + // suboptimal vs. ideal log2(1/fp_rate) bits. + uint32_t rounded = static_cast(desired_one_in_fp_rate_); + int upper_bits_per_key = 1 + FloorLog2(rounded); + double fp_rate_for_upper = std::pow(2.0, -upper_bits_per_key); + double portion_lower = + (1.0 / desired_one_in_fp_rate_ - fp_rate_for_upper) / + fp_rate_for_upper; + min_real_bits_per_slot = upper_bits_per_key - portion_lower; + assert(min_real_bits_per_slot > 0.0); + assert(min_real_bits_per_slot <= 32.0); + } + + // An overestimate, but this should only be O(1) slots away from truth. 
+ double max_slots = len_no_metadata * 8.0 / min_real_bits_per_slot; + + // Let's not bother accounting for overflow to Bloom filter + // (Includes NaN case) + if (!(max_slots < ConfigHelper::GetNumSlots(kMaxRibbonEntries))) { + return kMaxRibbonEntries; + } + + // Set up for short iteration + uint32_t slots = static_cast(max_slots); + slots = SolnType::RoundUpNumSlots(slots); + + // Assert that we have a valid upper bound on slots + assert(SolnType::GetBytesForOneInFpRate( + SolnType::RoundUpNumSlots(slots + 1), desired_one_in_fp_rate_, + /*rounding*/ 0) > len_no_metadata); + + // Iterate up to a few times to rather precisely account for small effects + for (int i = 0; slots > 0; ++i) { + size_t reqd_bytes = + SolnType::GetBytesForOneInFpRate(slots, desired_one_in_fp_rate_, + /*rounding*/ 0); + if (reqd_bytes <= len_no_metadata) { + break; // done + } + if (i >= 2) { + // should have been enough iterations + assert(false); + break; + } + slots = SolnType::RoundDownNumSlots(slots - 1); + } + + uint32_t num_entries = ConfigHelper::GetNumToAdd(slots); + + // Consider possible Bloom fallback for small filters + if (slots < 1024) { + size_t bloom = bloom_fallback_.ApproximateNumEntries(bytes); + if (bloom > num_entries) { + return bloom; + } else { + return num_entries; + } + } else { + return std::min(num_entries, kMaxRibbonEntries); + } + } + + double EstimatedFpRate(size_t num_entries, + size_t len_with_metadata) override { + if (num_entries > kMaxRibbonEntries) { + // More entries than supported by this Ribbon + return bloom_fallback_.EstimatedFpRate(num_entries, len_with_metadata); + } + uint32_t num_slots = + NumEntriesToNumSlots(static_cast(num_entries)); + SolnType fake_soln(nullptr, len_with_metadata); + fake_soln.ConfigureForNumSlots(num_slots); + return fake_soln.ExpectedFpRate(); + } + + protected: + size_t RoundDownUsableSpace(size_t available_size) override { + size_t rv = available_size - kMetadataLen; + + // round down to multiple of 16 (segment size) + rv &= ~size_t{15}; + + return rv + kMetadataLen; + } + + private: + using TS = Standard128RibbonTypesAndSettings; + using SolnType = ribbon::SerializableInterleavedSolution; + using BandingType = ribbon::StandardBanding; + using ConfigHelper = ribbon::BandingConfigHelper1TS; + + static uint32_t NumEntriesToNumSlots(uint32_t num_entries) { + uint32_t num_slots1 = ConfigHelper::GetNumSlots(num_entries); + return SolnType::RoundUpNumSlots(num_slots1); + } + + // Approximate num_entries to ensure number of bytes fits in 32 bits, which + // is not an inherent limitation but does ensure somewhat graceful Bloom + // fallback for crazy high number of entries, since the Bloom implementation + // does not support number of bytes bigger than fits in 32 bits. This is + // within an order of magnitude of implementation limit on num_slots + // fitting in 32 bits, and even closer for num_blocks fitting in 24 bits + // (for filter metadata). + static constexpr uint32_t kMaxRibbonEntries = 950000000; // ~ 1 billion + + // A desired value for 1/fp_rate. For example, 100 -> 1% fp rate. 
+ double desired_one_in_fp_rate_; + + // For warnings, or can be nullptr + Logger* info_log_; + + // For falling back on Bloom filter in some exceptional cases and + // very small filter cases + FastLocalBloomBitsBuilder bloom_fallback_; +}; + +// for the linker, at least with DEBUG_LEVEL=2 +constexpr uint32_t Standard128RibbonBitsBuilder::kMaxRibbonEntries; + +class Standard128RibbonBitsReader : public FilterBitsReader { + public: + Standard128RibbonBitsReader(const char* data, size_t len_bytes, + uint32_t num_blocks, uint32_t seed) + : soln_(const_cast(data), len_bytes) { + soln_.ConfigureForNumBlocks(num_blocks); + hasher_.SetOrdinalSeed(seed); + } + + // No Copy allowed + Standard128RibbonBitsReader(const Standard128RibbonBitsReader&) = delete; + void operator=(const Standard128RibbonBitsReader&) = delete; + + ~Standard128RibbonBitsReader() override {} + + bool MayMatch(const Slice& key) override { + uint64_t h = GetSliceHash64(key); + return soln_.FilterQuery(h, hasher_); + } + + virtual void MayMatch(int num_keys, Slice** keys, bool* may_match) override { + struct SavedData { + uint64_t seeded_hash; + uint32_t segment_num; + uint32_t num_columns; + uint32_t start_bits; + }; + std::array saved; + for (int i = 0; i < num_keys; ++i) { + ribbon::InterleavedPrepareQuery( + GetSliceHash64(*keys[i]), hasher_, soln_, &saved[i].seeded_hash, + &saved[i].segment_num, &saved[i].num_columns, &saved[i].start_bits); + } + for (int i = 0; i < num_keys; ++i) { + may_match[i] = ribbon::InterleavedFilterQuery( + saved[i].seeded_hash, saved[i].segment_num, saved[i].num_columns, + saved[i].start_bits, hasher_, soln_); + } + } + + private: + using TS = Standard128RibbonTypesAndSettings; + ribbon::SerializableInterleavedSolution soln_; + ribbon::StandardHasher hasher_; +}; + +// ##################### Legacy Bloom implementation ################### // + using LegacyBloomImpl = LegacyLocalityBloomImpl; class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { @@ -355,21 +767,25 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { void AddKey(const Slice& key) override; - Slice Finish(std::unique_ptr* buf) override; + virtual size_t EstimateEntriesAdded() override { + return hash_entries_.size(); + } - int CalculateNumEntry(const uint32_t bytes) override; + Slice Finish(std::unique_ptr* buf) override; - uint32_t CalculateSpace(const int num_entry) override { + size_t CalculateSpace(size_t num_entries) override { uint32_t dont_care1; uint32_t dont_care2; - return CalculateSpace(num_entry, &dont_care1, &dont_care2); + return CalculateSpace(num_entries, &dont_care1, &dont_care2); } double EstimatedFpRate(size_t keys, size_t bytes) override { - return LegacyBloomImpl::EstimatedFpRate(keys, bytes - /*metadata*/ 5, + return LegacyBloomImpl::EstimatedFpRate(keys, bytes - kMetadataLen, num_probes_); } + size_t ApproximateNumEntries(size_t bytes) override; + private: int bits_per_key_; int num_probes_; @@ -380,11 +796,11 @@ class LegacyBloomBitsBuilder : public BuiltinFilterBitsBuilder { uint32_t GetTotalBitsForLocality(uint32_t total_bits); // Reserve space for new filter - char* ReserveSpace(const int num_entry, uint32_t* total_bits, + char* ReserveSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines); // Implementation-specific variant of public CalculateSpace - uint32_t CalculateSpace(const int num_entry, uint32_t* total_bits, + uint32_t CalculateSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines); // Assuming single threaded access to this function. 
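For reference, the filter trailer written by Standard128RibbonBitsBuilder::Finish above (and parsed by GetRibbonBitsReader further down in this patch) is a fixed 5-byte suffix: a -2 marker byte, the ordinal hash seed, and a 24-bit little-endian block count. The following is an editor's sketch of that layout, not code from the patch; the struct and helper names are invented for illustration.

#include <cassert>
#include <cstdint>

struct RibbonMeta {
  uint32_t seed;        // ordinal seed in [0, 255]
  uint32_t num_blocks;  // 24-bit interleaved-solution block count
};

// Writes the 5 trailer bytes at the end of the filter buffer.
inline void EncodeRibbonMeta(char* end, const RibbonMeta& m) {
  end[-5] = static_cast<char>(-2);  // marker for Standard128 Ribbon
  end[-4] = static_cast<char>(m.seed);
  end[-3] = static_cast<char>(m.num_blocks & 255);
  end[-2] = static_cast<char>((m.num_blocks >> 8) & 255);
  end[-1] = static_cast<char>((m.num_blocks >> 16) & 255);
}

// Reads the trailer back, mirroring what GetRibbonBitsReader does.
inline RibbonMeta DecodeRibbonMeta(const char* end) {
  assert(static_cast<int8_t>(end[-5]) == -2);
  RibbonMeta m;
  m.seed = static_cast<uint8_t>(end[-4]);
  m.num_blocks = static_cast<uint8_t>(end[-3]);
  m.num_blocks |= static_cast<uint32_t>(static_cast<uint8_t>(end[-2])) << 8;
  m.num_blocks |= static_cast<uint32_t>(static_cast<uint8_t>(end[-1])) << 16;
  return m;
}

As the reader code in this patch notes, a decoded num_blocks of 0 or 1 is not a usable Ribbon filter and is treated as "always true" instead.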
@@ -452,7 +868,29 @@ Slice LegacyBloomBitsBuilder::Finish(std::unique_ptr* buf) { buf->reset(const_data); hash_entries_.clear(); - return Slice(data, total_bits / 8 + 5); + return Slice(data, total_bits / 8 + kMetadataLen); +} + +size_t LegacyBloomBitsBuilder::ApproximateNumEntries(size_t bytes) { + assert(bits_per_key_); + assert(bytes > 0); + + uint64_t total_bits_tmp = bytes * 8; + // total bits, including temporary computations, cannot exceed 2^32 + // for compatibility + total_bits_tmp = std::min(total_bits_tmp, uint64_t{0xffff0000}); + + uint32_t high = static_cast(total_bits_tmp) / + static_cast(bits_per_key_) + + 1; + uint32_t low = 1; + uint32_t n = high; + for (; n >= low; n--) { + if (CalculateSpace(n) <= bytes) { + break; + } + } + return n; } uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { @@ -467,14 +905,18 @@ uint32_t LegacyBloomBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { return num_lines * (CACHE_LINE_SIZE * 8); } -uint32_t LegacyBloomBitsBuilder::CalculateSpace(const int num_entry, +uint32_t LegacyBloomBitsBuilder::CalculateSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines) { assert(bits_per_key_); - if (num_entry != 0) { - uint32_t total_bits_tmp = static_cast(num_entry * bits_per_key_); - - *total_bits = GetTotalBitsForLocality(total_bits_tmp); + if (num_entries != 0) { + size_t total_bits_tmp = num_entries * bits_per_key_; + // total bits, including temporary computations, cannot exceed 2^32 + // for compatibility + total_bits_tmp = std::min(total_bits_tmp, size_t{0xffff0000}); + + *total_bits = + GetTotalBitsForLocality(static_cast(total_bits_tmp)); *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); assert(*total_bits > 0 && *total_bits % 8 == 0); } else { @@ -485,34 +927,19 @@ uint32_t LegacyBloomBitsBuilder::CalculateSpace(const int num_entry, // Reserve space for Filter uint32_t sz = *total_bits / 8; - sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + sz += kMetadataLen; // 4 bytes for num_lines, 1 byte for num_probes return sz; } -char* LegacyBloomBitsBuilder::ReserveSpace(const int num_entry, +char* LegacyBloomBitsBuilder::ReserveSpace(size_t num_entries, uint32_t* total_bits, uint32_t* num_lines) { - uint32_t sz = CalculateSpace(num_entry, total_bits, num_lines); + uint32_t sz = CalculateSpace(num_entries, total_bits, num_lines); char* data = new char[sz]; memset(data, 0, sz); return data; } -int LegacyBloomBitsBuilder::CalculateNumEntry(const uint32_t bytes) { - assert(bits_per_key_); - assert(bytes > 0); - int high = static_cast(bytes * 8 / bits_per_key_ + 1); - int low = 1; - int n = high; - for (; n >= low; n--) { - if (CalculateSpace(n) <= bytes) { - break; - } - } - assert(n < high); // High should be an overestimation - return n; -} - inline void LegacyBloomBitsBuilder::AddHash(uint32_t h, char* data, uint32_t num_lines, uint32_t total_bits) { @@ -595,11 +1022,13 @@ const std::vector BloomFilterPolicy::kAllFixedImpls = { kLegacyBloom, kDeprecatedBlock, kFastLocalBloom, + kStandard128Ribbon, }; const std::vector BloomFilterPolicy::kAllUserModes = { kDeprecatedBlock, - kAuto, + kAutoBloom, + kStandard128Ribbon, }; BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) @@ -616,6 +1045,15 @@ BloomFilterPolicy::BloomFilterPolicy(double bits_per_key, Mode mode) // point are interpreted accurately. millibits_per_key_ = static_cast(bits_per_key * 1000.0 + 0.500001); + // For now configure Ribbon filter to match Bloom FP rate and save + // memory. 
(Ribbon bits per key will be ~30% less than Bloom bits per key + // for same FP rate.) + desired_one_in_fp_rate_ = + 1.0 / BloomMath::CacheLocalFpRate( + bits_per_key, + FastLocalBloomImpl::ChooseNumProbes(millibits_per_key_), + /*cache_line_bits*/ 512); + // For better or worse, this is a rounding up of a nudged rounding up, // e.g. 7.4999999999999 will round up to 8, but that provides more // predictability against small arithmetic errors in floating point. @@ -700,7 +1138,7 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( // one exhaustive switch without (risky) recursion for (int i = 0; i < 2; ++i) { switch (cur) { - case kAuto: + case kAutoBloom: if (context.table_options.format_version < 5) { cur = kLegacyBloom; } else { @@ -733,6 +1171,10 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderWithContext( } return new LegacyBloomBitsBuilder(whole_bits_per_key_, context.info_log); + case kStandard128Ribbon: + return new Standard128RibbonBitsBuilder( + desired_one_in_fp_rate_, millibits_per_key_, + offm ? &aggregate_rounding_balance_ : nullptr, context.info_log); } } assert(false); @@ -753,7 +1195,7 @@ FilterBitsBuilder* BloomFilterPolicy::GetBuilderFromContext( FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( const Slice& contents) const { uint32_t len_with_meta = static_cast(contents.size()); - if (len_with_meta <= 5) { + if (len_with_meta <= kMetadataLen) { // filter is empty or broken. Treat like zero keys added. return new AlwaysFalseFilter(); } @@ -771,7 +1213,7 @@ FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( // len_with_meta +-----------------------------------+ int8_t raw_num_probes = - static_cast(contents.data()[len_with_meta - 5]); + static_cast(contents.data()[len_with_meta - kMetadataLen]); // NB: *num_probes > 30 and < 128 probably have not been used, because of // BloomFilterPolicy::initialize, unless directly calling // LegacyBloomBitsBuilder as an API, but we are leaving those cases in @@ -780,13 +1222,20 @@ FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( if (raw_num_probes < 1) { // Note: < 0 (or unsigned > 127) indicate special new implementations // (or reserved for future use) - if (raw_num_probes == -1) { - // Marker for newer Bloom implementations - return GetBloomBitsReader(contents); + switch (raw_num_probes) { + case 0: + // Treat as zero probes (always FP) + return new AlwaysTrueFilter(); + case -1: + // Marker for newer Bloom implementations + return GetBloomBitsReader(contents); + case -2: + // Marker for Ribbon implementations + return GetRibbonBitsReader(contents); + default: + // Reserved (treat as zero probes, always FP, for now) + return new AlwaysTrueFilter(); } - // otherwise - // Treat as zero probes (always FP) for now. 
- return new AlwaysTrueFilter(); } // else attempt decode for LegacyBloomBitsReader @@ -794,7 +1243,7 @@ FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( assert(num_probes >= 1); assert(num_probes <= 127); - uint32_t len = len_with_meta - 5; + uint32_t len = len_with_meta - kMetadataLen; assert(len > 0); uint32_t num_lines = DecodeFixed32(contents.data() + len_with_meta - 4); @@ -824,11 +1273,34 @@ FilterBitsReader* BloomFilterPolicy::GetFilterBitsReader( log2_cache_line_size); } +FilterBitsReader* BloomFilterPolicy::GetRibbonBitsReader( + const Slice& contents) const { + uint32_t len_with_meta = static_cast(contents.size()); + uint32_t len = len_with_meta - kMetadataLen; + + assert(len > 0); // precondition + + uint32_t seed = static_cast(contents.data()[len + 1]); + uint32_t num_blocks = static_cast(contents.data()[len + 2]); + num_blocks |= static_cast(contents.data()[len + 3]) << 8; + num_blocks |= static_cast(contents.data()[len + 4]) << 16; + if (num_blocks < 2) { + // Not supported + // num_blocks == 1 is not used because num_starts == 1 is problematic + // for the hashing scheme. num_blocks == 0 is unused because there's + // already a concise encoding of an "always false" filter. + // Return something safe: + return new AlwaysTrueFilter(); + } + return new Standard128RibbonBitsReader(contents.data(), len, num_blocks, + seed); +} + // For newer Bloom filter implementations FilterBitsReader* BloomFilterPolicy::GetBloomBitsReader( const Slice& contents) const { uint32_t len_with_meta = static_cast(contents.size()); - uint32_t len = len_with_meta - 5; + uint32_t len = len_with_meta - kMetadataLen; assert(len > 0); // precondition @@ -890,7 +1362,7 @@ const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, if (use_block_based_builder) { m = BloomFilterPolicy::kDeprecatedBlock; } else { - m = BloomFilterPolicy::kAuto; + m = BloomFilterPolicy::kAutoBloom; } assert(std::find(BloomFilterPolicy::kAllUserModes.begin(), BloomFilterPolicy::kAllUserModes.end(), @@ -898,6 +1370,12 @@ const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, return new BloomFilterPolicy(bits_per_key, m); } +extern const FilterPolicy* NewRibbonFilterPolicy( + double bloom_equivalent_bits_per_key) { + return new BloomFilterPolicy(bloom_equivalent_bits_per_key, + BloomFilterPolicy::kStandard128Ribbon); +} + FilterBuildingContext::FilterBuildingContext( const BlockBasedTableOptions& _table_options) : table_options(_table_options) {} @@ -908,6 +1386,8 @@ Status FilterPolicy::CreateFromString( const ConfigOptions& /*options*/, const std::string& value, std::shared_ptr* policy) { const std::string kBloomName = "bloomfilter:"; + const std::string kExpRibbonName = "experimental_ribbon:"; + const std::string kRibbonName = "ribbonfilter:"; if (value == kNullptrString || value == "rocksdb.BuiltinBloomFilter") { policy->reset(); #ifndef ROCKSDB_LITE @@ -924,6 +1404,15 @@ Status FilterPolicy::CreateFromString( policy->reset( NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); } + } else if (value.compare(0, kExpRibbonName.size(), kExpRibbonName) == 0) { + double bloom_equivalent_bits_per_key = + ParseDouble(trim(value.substr(kExpRibbonName.size()))); + policy->reset( + NewExperimentalRibbonFilterPolicy(bloom_equivalent_bits_per_key)); + } else if (value.compare(0, kRibbonName.size(), kRibbonName) == 0) { + double bloom_equivalent_bits_per_key = + ParseDouble(trim(value.substr(kRibbonName.size()))); + policy->reset(NewRibbonFilterPolicy(bloom_equivalent_bits_per_key)); } else { return 
Status::NotFound("Invalid filter policy name ", value); #else diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h index 783373b2627..1a8acfc9d4f 100644 --- a/table/block_based/filter_policy_internal.h +++ b/table/block_based/filter_policy_internal.h @@ -25,14 +25,17 @@ class Slice; class BuiltinFilterBitsBuilder : public FilterBitsBuilder { public: // Calculate number of bytes needed for a new filter, including - // metadata. Passing the result to CalculateNumEntry should - // return >= the num_entry passed in. - virtual uint32_t CalculateSpace(const int num_entry) = 0; + // metadata. Passing the result to ApproximateNumEntries should + // (ideally, usually) return >= the num_entry passed in. + // When optimize_filters_for_memory is enabled, this function + // is not authoritative but represents a target size that should + // be close to the average size. + virtual size_t CalculateSpace(size_t num_entries) = 0; // Returns an estimate of the FP rate of the returned filter if - // `keys` keys are added and the filter returned by Finish is `bytes` - // bytes. - virtual double EstimatedFpRate(size_t keys, size_t bytes) = 0; + // `num_entries` keys are added and the filter returned by Finish + // is `bytes` bytes. + virtual double EstimatedFpRate(size_t num_entries, size_t bytes) = 0; }; // RocksDB built-in filter policy for Bloom or Bloom-like filters. @@ -64,10 +67,12 @@ class BloomFilterPolicy : public FilterPolicy { // FastLocalBloomImpl. // NOTE: TESTING ONLY as this mode does not check format_version kFastLocalBloom = 2, - // Automatically choose from the above (except kDeprecatedBlock) based on + // A Bloom alternative saving about 30% space for ~3-4x construction + // CPU time. See ribbon_alg.h and ribbon_impl.h. + kStandard128Ribbon = 3, + // Automatically choose between kLegacyBloom and kFastLocalBloom based on // context at build time, including compatibility with format_version. - // NOTE: This is currently the only recommended mode that is user exposed. - kAuto = 100, + kAutoBloom = 100, }; // All the different underlying implementations that a BloomFilterPolicy // might use, as a mode that says "always use this implementation." @@ -115,8 +120,12 @@ class BloomFilterPolicy : public FilterPolicy { int GetMillibitsPerKey() const { return millibits_per_key_; } // Essentially for testing only: legacy whole bits/key int GetWholeBitsPerKey() const { return whole_bits_per_key_; } + // Testing only + Mode GetMode() const { return mode_; } private: + // Bits per key settings are for configuring Bloom filters. + // Newer filters support fractional bits per key. For predictable behavior // of 0.001-precision values across floating point implementations, we // round to thousandths of a bit (on average) per key. @@ -127,6 +136,10 @@ class BloomFilterPolicy : public FilterPolicy { // behavior with format_version < 5 just in case.) int whole_bits_per_key_; + // For configuring Ribbon filter: a desired value for 1/fp_rate. For + // example, 100 -> 1% fp rate. + double desired_one_in_fp_rate_; + // Selected mode (a specific implementation or way of selecting an // implementation) for building new SST filters. 
Mode mode_; @@ -147,6 +160,9 @@ class BloomFilterPolicy : public FilterPolicy { // For newer Bloom filter implementation(s) FilterBitsReader* GetBloomBitsReader(const Slice& contents) const; + + // For Ribbon filter implementation(s) + FilterBitsReader* GetRibbonBitsReader(const Slice& contents) const; }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index a104bec4749..0e336c37fb6 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -22,42 +22,63 @@ FullFilterBlockBuilder::FullFilterBlockBuilder( whole_key_filtering_(whole_key_filtering), last_whole_key_recorded_(false), last_prefix_recorded_(false), - num_added_(0) { + last_key_in_domain_(false), + any_added_(false) { assert(filter_bits_builder != nullptr); filter_bits_builder_.reset(filter_bits_builder); } -void FullFilterBlockBuilder::Add(const Slice& key) { - const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain(key); +size_t FullFilterBlockBuilder::EstimateEntriesAdded() { + return filter_bits_builder_->EstimateEntriesAdded(); +} + +void FullFilterBlockBuilder::Add(const Slice& key_without_ts) { + const bool add_prefix = + prefix_extractor_ && prefix_extractor_->InDomain(key_without_ts); + + if (!last_prefix_recorded_ && last_key_in_domain_) { + // We can reach here when a new filter partition starts in partitioned + // filter. The last prefix in the previous partition should be added if + // necessary regardless of key_without_ts, to support prefix SeekForPrev. + AddKey(last_prefix_str_); + last_prefix_recorded_ = true; + } + if (whole_key_filtering_) { if (!add_prefix) { - AddKey(key); + AddKey(key_without_ts); } else { // if both whole_key and prefix are added to bloom then we will have whole - // key and prefix addition being interleaved and thus cannot rely on the - // bits builder to properly detect the duplicates by comparing with the - // last item. + // key_without_ts and prefix addition being interleaved and thus cannot + // rely on the bits builder to properly detect the duplicates by comparing + // with the last item. 
Slice last_whole_key = Slice(last_whole_key_str_); - if (!last_whole_key_recorded_ || last_whole_key.compare(key) != 0) { - AddKey(key); + if (!last_whole_key_recorded_ || + last_whole_key.compare(key_without_ts) != 0) { + AddKey(key_without_ts); last_whole_key_recorded_ = true; - last_whole_key_str_.assign(key.data(), key.size()); + last_whole_key_str_.assign(key_without_ts.data(), + key_without_ts.size()); } } } if (add_prefix) { - AddPrefix(key); + last_key_in_domain_ = true; + AddPrefix(key_without_ts); + } else { + last_key_in_domain_ = false; } } // Add key to filter if needed inline void FullFilterBlockBuilder::AddKey(const Slice& key) { filter_bits_builder_->AddKey(key); - num_added_++; + any_added_ = true; } // Add prefix to filter if needed void FullFilterBlockBuilder::AddPrefix(const Slice& key) { + assert(prefix_extractor_ && prefix_extractor_->InDomain(key)); Slice prefix = prefix_extractor_->Transform(key); if (whole_key_filtering_) { // if both whole_key and prefix are added to bloom then we will have whole @@ -85,8 +106,8 @@ Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, Reset(); // In this impl we ignore BlockHandle *status = Status::OK(); - if (num_added_ != 0) { - num_added_ = 0; + if (any_added_) { + any_added_ = false; return filter_bits_builder_->Finish(&filter_data_); } return Slice(); @@ -245,9 +266,9 @@ void FullFilterBlockReader::MayMatch( MultiGetRange filter_range(*range, range->begin(), range->end()); for (auto iter = filter_range.begin(); iter != filter_range.end(); ++iter) { if (!prefix_extractor) { - keys[num_keys++] = &iter->ukey; - } else if (prefix_extractor->InDomain(iter->ukey)) { - prefixes.emplace_back(prefix_extractor->Transform(iter->ukey)); + keys[num_keys++] = &iter->ukey_without_ts; + } else if (prefix_extractor->InDomain(iter->ukey_without_ts)) { + prefixes.emplace_back(prefix_extractor->Transform(iter->ukey_without_ts)); keys[num_keys++] = &prefixes.back(); } else { filter_range.SkipKey(iter); @@ -283,16 +304,16 @@ size_t FullFilterBlockReader::ApproximateMemoryUsage() const { } bool FullFilterBlockReader::RangeMayExist( - const Slice* iterate_upper_bound, const Slice& user_key, + const Slice* iterate_upper_bound, const Slice& user_key_without_ts, const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, bool need_upper_bound_check, bool no_io, BlockCacheLookupContext* lookup_context) { - if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { + if (!prefix_extractor || !prefix_extractor->InDomain(user_key_without_ts)) { *filter_checked = false; return true; } - Slice prefix = prefix_extractor->Transform(user_key); + Slice prefix = prefix_extractor->Transform(user_key_without_ts); if (need_upper_bound_check && !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) { *filter_checked = false; @@ -318,7 +339,8 @@ bool FullFilterBlockReader::IsFilterCompatible( } Slice upper_bound_xform = prefix_extractor->Transform(*iterate_upper_bound); // first check if user_key and upper_bound all share the same prefix - if (!comparator->Equal(prefix, upper_bound_xform)) { + if (comparator->CompareWithoutTimestamp(prefix, false, upper_bound_xform, + false) != 0) { // second check if user_key's prefix is the immediate predecessor of // upper_bound and have the same length. If so, we know for sure all // keys in the range [user_key, upper_bound) share the same prefix. 
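The FilterPolicy::CreateFromString changes earlier in this patch add a "ribbonfilter:" spelling alongside the older "experimental_ribbon:", both mapping to NewRibbonFilterPolicy, whose single argument is Bloom-equivalent bits per key (internally converted to desired_one_in_fp_rate_ via the cache-local Bloom FP-rate formula). A usage sketch follows; it is illustrative only and assumes the default rocksdb namespace.

#include <memory>
#include "rocksdb/filter_policy.h"
#include "rocksdb/table.h"

void UseRibbonFilter(rocksdb::BlockBasedTableOptions* bbto) {
  // 10 "Bloom-equivalent" bits per key: the Ribbon filter targets roughly the
  // FP rate a 10 bits/key Bloom filter would have, while using less space.
  bbto->filter_policy.reset(rocksdb::NewRibbonFilterPolicy(10));
}

// The same policy can be selected from an options string handled by
// FilterPolicy::CreateFromString(), e.g. "ribbonfilter:10"
// (the older "experimental_ribbon:10" spelling is still accepted).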
diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index 42f4dbbc3a5..e5f6df659f0 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -50,8 +50,9 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { virtual bool IsBlockBased() override { return false; } virtual void StartBlock(uint64_t /*block_offset*/) override {} - virtual void Add(const Slice& key) override; - virtual size_t NumAdded() const override { return num_added_; } + virtual void Add(const Slice& key_without_ts) override; + virtual bool IsEmpty() const override { return !any_added_; } + virtual size_t EstimateEntriesAdded() override; virtual Slice Finish(const BlockHandle& tmp, Status* status) override; using FilterBlockBuilder::Finish; @@ -61,6 +62,7 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { virtual void Reset(); void AddPrefix(const Slice& key); const SliceTransform* prefix_extractor() { return prefix_extractor_; } + const std::string& last_prefix_str() const { return last_prefix_str_; } private: // important: all of these might point to invalid addresses @@ -72,10 +74,13 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { std::string last_whole_key_str_; bool last_prefix_recorded_; std::string last_prefix_str_; - - uint32_t num_added_; + // Whether prefix_extractor_->InDomain(last_whole_key_) is true. + // Used in partitioned filters so that the last prefix from the previous + // filter partition will be added to the current partition if + // last_key_in_domain_ is true, regardless of the current key. + bool last_key_in_domain_; + bool any_added_; std::unique_ptr filter_data_; - }; // A FilterBlockReader is used to parse filter from SST table. diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 496b149ab75..b3563da3ea3 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -3,13 +3,16 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include "table/block_based/full_filter_block.h" + #include -#include "table/block_based/full_filter_block.h" #include "rocksdb/filter_policy.h" +#include "rocksdb/status.h" #include "table/block_based/block_based_table_reader.h" -#include "table/block_based/mock_block_based_table.h" #include "table/block_based/filter_policy_internal.h" +#include "table/block_based/mock_block_based_table.h" +#include "table/format.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" @@ -224,8 +227,8 @@ class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder { return rv; } - int CalculateNumEntry(const uint32_t bytes) override { - return b_->CalculateNumEntry(bytes); + size_t ApproximateNumEntries(size_t bytes) override { + return b_->ApproximateNumEntries(bytes); } size_t CountUnique() { return uniq_.size(); } @@ -239,11 +242,9 @@ TEST_F(FullFilterBlockTest, DuplicateEntries) { const bool WHOLE_KEY = true; FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, bits_builder); - ASSERT_EQ(0, builder.NumAdded()); ASSERT_EQ(0, bits_builder->CountUnique()); // adds key and empty prefix; both abstractions count them builder.Add("key1"); - ASSERT_EQ(2, builder.NumAdded()); ASSERT_EQ(2, bits_builder->CountUnique()); // Add different key (unique) and also empty prefix (not unique). 
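// Editor's note (illustrative, not part of the patch): the builtin bits
// builders only suppress consecutive duplicates, by comparing each new hash
// with the last one added, which is why the block builder also remembers
// last_whole_key_str_ / last_prefix_str_ when whole-key and prefix adds are
// interleaved. A hypothetical sequence, assuming a prefix extractor that
// keeps the first 7 bytes of the key:
//   builder.Add("prefix1key1");  // adds whole key and prefix "prefix1"
//   builder.Add("prefix1key1");  // exact repeat: neither is re-added
//   builder.Add("prefix1key2");  // new whole key added; same prefix skipped
// EstimateEntriesAdded() therefore reflects unique adds, which is what the
// updated SingleChunk test relies on when it expects 4 entries for
// {foo, bar, box, box, hello}.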
// From here in this test, it's immaterial whether the block builder @@ -262,7 +263,6 @@ TEST_F(FullFilterBlockTest, DuplicateEntries) { const bool WHOLE_KEY = true; FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, bits_builder); - ASSERT_EQ(0, builder.NumAdded()); builder.Add(""); // test with empty key too builder.Add("prefix1key1"); builder.Add("prefix1key1"); @@ -275,14 +275,19 @@ TEST_F(FullFilterBlockTest, DuplicateEntries) { TEST_F(FullFilterBlockTest, SingleChunk) { FullFilterBlockBuilder builder(nullptr, true, GetBuilder()); - ASSERT_EQ(0, builder.NumAdded()); + ASSERT_TRUE(builder.IsEmpty()); builder.Add("foo"); + ASSERT_FALSE(builder.IsEmpty()); builder.Add("bar"); builder.Add("box"); builder.Add("box"); builder.Add("hello"); - ASSERT_EQ(5, builder.NumAdded()); - Slice slice = builder.Finish(); + // "box" only counts once + ASSERT_EQ(4, builder.EstimateEntriesAdded()); + ASSERT_FALSE(builder.IsEmpty()); + Status s; + Slice slice = builder.Finish(BlockHandle(), &s); + ASSERT_OK(s); CachableEntry block( new ParsedFullFilterBlock(table_options_.filter_policy.get(), diff --git a/table/block_based/hash_index_reader.cc b/table/block_based/hash_index_reader.cc index 7a281edbf08..4d8544161ce 100644 --- a/table/block_based/hash_index_reader.cc +++ b/table/block_based/hash_index_reader.cc @@ -66,7 +66,7 @@ Status HashIndexReader::Create(const BlockBasedTable* table, RandomAccessFileReader* const file = rep->file.get(); const Footer& footer = rep->footer; - const ImmutableCFOptions& ioptions = rep->ioptions; + const ImmutableOptions& ioptions = rep->ioptions; const PersistentCacheOptions& cache_options = rep->persistent_cache_options; MemoryAllocator* const memory_allocator = GetMemoryAllocator(rep->table_options); diff --git a/table/block_based/index_reader_common.cc b/table/block_based/index_reader_common.cc index 76f894d59ff..275ae56dc19 100644 --- a/table/block_based/index_reader_common.cc +++ b/table/block_based/index_reader_common.cc @@ -26,7 +26,8 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( const Status s = table->RetrieveBlock( prefetch_buffer, read_options, rep->footer.index_handle(), UncompressionDict::GetEmptyDict(), index_block, BlockType::kIndex, - get_context, lookup_context, /* for_compaction */ false, use_cache); + get_context, lookup_context, /* for_compaction */ false, use_cache, + /* wait_for_cache */ true); return s; } diff --git a/table/block_based/mock_block_based_table.h b/table/block_based/mock_block_based_table.h index e1dcf0ebbfc..e0533a71753 100644 --- a/table/block_based/mock_block_based_table.h +++ b/table/block_based/mock_block_based_table.h @@ -23,7 +23,7 @@ class MockBlockBasedTableTester { public: Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; EnvOptions env_options_; BlockBasedTableOptions table_options_; InternalKeyComparator icomp_; @@ -47,7 +47,7 @@ class MockBlockBasedTableTester { context.column_family_name = "mock_cf"; context.compaction_style = ioptions_.compaction_style; context.level_at_creation = kMockLevel; - context.info_log = ioptions_.info_log; + context.info_log = ioptions_.logger; return BloomFilterPolicy::GetBuilderFromContext(context); } }; diff --git a/table/block_based/parsed_full_filter_block.h b/table/block_based/parsed_full_filter_block.h index 36c619921d8..95d7b520871 100644 --- a/table/block_based/parsed_full_filter_block.h +++ b/table/block_based/parsed_full_filter_block.h @@ -32,6 +32,8 @@ class ParsedFullFilterBlock { bool own_bytes() const { return 
block_contents_.own_bytes(); } + const Slice GetBlockContentsData() const { return block_contents_.data; } + private: BlockContents block_contents_; std::unique_ptr filter_bits_reader_; diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index dc25abbea41..61cd1258781 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -7,7 +7,7 @@ #include -#include "file/file_util.h" +#include "file/random_access_file_reader.h" #include "monitoring/perf_context_imp.h" #include "port/malloc.h" #include "port/port.h" @@ -33,16 +33,18 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( true /*use_delta_encoding*/, use_value_delta_encoding), p_index_builder_(p_index_builder), - keys_added_to_partition_(0) { - keys_per_partition_ = - filter_bits_builder_->CalculateNumEntry(partition_size); + keys_added_to_partition_(0), + total_added_in_built_(0) { + keys_per_partition_ = static_cast( + filter_bits_builder_->ApproximateNumEntries(partition_size)); if (keys_per_partition_ < 1) { // partition_size (minus buffer, ~10%) might be smaller than minimum // filter size, sometimes based on cache line size. Try to find that // minimum size without CalculateSpace (not necessarily available). uint32_t larger = std::max(partition_size + 4, uint32_t{16}); for (;;) { - keys_per_partition_ = filter_bits_builder_->CalculateNumEntry(larger); + keys_per_partition_ = static_cast( + filter_bits_builder_->ApproximateNumEntries(larger)); if (keys_per_partition_ >= 1) { break; } @@ -72,15 +74,19 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock( } filter_gc.push_back(std::unique_ptr(nullptr)); - // Add the prefix of the next key before finishing the partition. This hack, - // fixes a bug with format_verison=3 where seeking for the prefix would lead - // us to the previous partition. - const bool add_prefix = + // Add the prefix of the next key before finishing the partition without + // updating last_prefix_str_. This hack, fixes a bug with format_verison=3 + // where seeking for the prefix would lead us to the previous partition. 
+ const bool maybe_add_prefix = next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key); - if (add_prefix) { - FullFilterBlockBuilder::AddPrefix(*next_key); + if (maybe_add_prefix) { + const Slice next_key_prefix = prefix_extractor()->Transform(*next_key); + if (next_key_prefix.compare(last_prefix_str()) != 0) { + AddKey(next_key_prefix); + } } + total_added_in_built_ += filter_bits_builder_->EstimateEntriesAdded(); Slice filter = filter_bits_builder_->Finish(&filter_gc.back()); std::string& index_key = p_index_builder_->GetPartitionKey(); filters.push_back({index_key, filter}); @@ -98,6 +104,10 @@ void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { keys_added_to_partition_++; } +size_t PartitionedFilterBlockBuilder::EstimateEntriesAdded() { + return total_added_in_built_ + filter_bits_builder_->EstimateEntriesAdded(); +} + Slice PartitionedFilterBlockBuilder::Finish( const BlockHandle& last_partition_block_handle, Status* status) { if (finishing_filters == true) { @@ -127,6 +137,8 @@ Slice PartitionedFilterBlockBuilder::Finish( if (UNLIKELY(filters.empty())) { *status = Status::OK(); if (finishing_filters) { + // Simplest to just add them all at the end + total_added_in_built_ = 0; if (p_index_builder_->seperator_is_key_plus_seq()) { return index_on_filter_block_builder_.Finish(); } else { @@ -284,7 +296,8 @@ Status PartitionedFilterBlockReader::GetFilterPartitionBlock( table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, UncompressionDict::GetEmptyDict(), filter_block, BlockType::kFilter, get_context, lookup_context, - /* for_compaction */ false, /* use_cache */ true); + /* for_compaction */ false, /* use_cache */ true, + /* wait_for_cache */ true); return s; } @@ -412,8 +425,8 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { } // TODO(myabandeh): merge this with the same function in IndexReader -void PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, - bool pin) { +Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, + bool pin) { assert(table()); const BlockBasedTable::Rep* const rep = table()->get_rep(); @@ -426,12 +439,11 @@ void PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, &lookup_context, &filter_block); if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Error retrieving top-level filter block while trying to " - "cache filter partitions: %s", - s.ToString().c_str()); - IGNORE_STATUS_IF_ERROR(s); - return; + ROCKS_LOG_ERROR(rep->ioptions.logger, + "Error retrieving top-level filter block while trying to " + "cache filter partitions: %s", + s.ToString().c_str()); + return s; } // Before read partitions, prefetch them to avoid lots of IOs @@ -457,14 +469,18 @@ void PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; + rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer, + false /* Implicit autoreadahead */); - prefetch_buffer.reset(new FilePrefetchBuffer()); IOOptions opts; - s = PrepareIOFromReadOptions(ro, rep->file->env(), opts); + s = rep->file->PrepareIOOptions(ro, opts); if (s.ok()) { s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, static_cast(prefetch_len)); } + if (!s.ok()) { + return s; + } // After prefetch, read the partitions one by one for 
(biter.SeekToFirst(); biter.Valid(); biter.Next()) { @@ -475,19 +491,22 @@ void PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, // filter blocks s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - &block, BlockType::kFilter, nullptr /* get_context */, &lookup_context, - nullptr /* contents */); - + /* wait */ true, &block, BlockType::kFilter, nullptr /* get_context */, + &lookup_context, nullptr /* contents */); + if (!s.ok()) { + return s; + } assert(s.ok() || block.GetValue() == nullptr); - if (s.ok() && block.GetValue() != nullptr) { + + if (block.GetValue() != nullptr) { if (block.IsCached()) { if (pin) { filter_map_[handle.offset()] = std::move(block); } } } - IGNORE_STATUS_IF_ERROR(s); } + return biter.status(); } const InternalKeyComparator* PartitionedFilterBlockReader::internal_comparator() diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 2ccc8f8bcf5..0d0ee89e79e 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -8,6 +8,7 @@ #include #include #include + #include "db/dbformat.h" #include "index_builder.h" #include "rocksdb/options.h" @@ -33,6 +34,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { void AddKey(const Slice& key) override; void Add(const Slice& key) override; + size_t EstimateEntriesAdded() override; virtual Slice Finish(const BlockHandle& last_partition_block_handle, Status* status) override; @@ -62,6 +64,9 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { uint32_t keys_per_partition_; // The number of keys added to the last partition so far uint32_t keys_added_to_partition_; + // According to the bits builders, how many keys/prefixes added + // in all the filters we have fully built + uint64_t total_added_in_built_; BlockHandle last_encoded_handle_; }; @@ -130,7 +135,7 @@ class PartitionedFilterBlockReader : public FilterBlockReaderCommon { uint64_t block_offset, BlockHandle filter_handle, bool no_io, BlockCacheLookupContext* lookup_context, FilterManyFunction filter_function) const; - void CacheDependencies(const ReadOptions& ro, bool pin) override; + Status CacheDependencies(const ReadOptions& ro, bool pin) override; const InternalKeyComparator* internal_comparator() const; bool index_key_includes_seq() const; diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index e23f910b537..7b4d49baf11 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -12,7 +12,6 @@ #include "table/block_based/filter_policy_internal.h" #include "index_builder.h" -#include "logging/logging.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" @@ -59,7 +58,7 @@ class PartitionedFilterBlockTest virtual public ::testing::WithParamInterface { public: Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; EnvOptions env_options_; BlockBasedTableOptions table_options_; InternalKeyComparator icomp_; diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc index 0dbd132b16a..acb40f125d1 100644 --- a/table/block_based/partitioned_index_reader.cc +++ b/table/block_based/partitioned_index_reader.cc @@ -8,7 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
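Both PartitionedFilterBlockReader::CacheDependencies above and PartitionIndexReader::CacheDependencies below prefetch one contiguous byte range covering every partition before loading the partitions into the block cache one by one. An editor's sketch of that range computation, with hypothetical types (the 5-byte constant corresponds to the per-block trailer in the BlockBasedTable format):

#include <cstdint>

struct Handle {
  uint64_t offset;
  uint64_t size;
};

// 1 byte compression type + 4 bytes checksum after every block.
constexpr uint64_t kBlockTrailerSize = 5;

// Given the handles of the first and last partition (as produced by the
// partition index iterator), compute the single prefetch window.
inline void PartitionPrefetchRange(const Handle& first, const Handle& last,
                                   uint64_t* prefetch_off,
                                   uint64_t* prefetch_len) {
  *prefetch_off = first.offset;
  uint64_t last_off = last.offset + last.size + kBlockTrailerSize;
  *prefetch_len = last_off - *prefetch_off;
}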
#include "table/block_based/partitioned_index_reader.h" -#include "file/file_util.h" +#include "file/random_access_file_reader.h" #include "table/block_based/partitioned_index_iterator.h" namespace ROCKSDB_NAMESPACE { @@ -146,9 +146,10 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, uint64_t last_off = handle.offset() + block_size(handle); uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; - rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer); + rep->CreateFilePrefetchBuffer(0, 0, &prefetch_buffer, + false /*Implicit auto readahead*/); IOOptions opts; - s = PrepareIOFromReadOptions(ro, rep->file->env(), opts); + s = rep->file->PrepareIOOptions(ro, opts); if (s.ok()) { s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, static_cast(prefetch_len)); @@ -166,8 +167,8 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, // filter blocks s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - &block, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context, - /*contents=*/nullptr); + /*wait=*/true, &block, BlockType::kIndex, /*get_context=*/nullptr, + &lookup_context, /*contents=*/nullptr); if (!s.ok()) { return s; diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc index db33e934067..dae5ddac2ac 100644 --- a/table/block_based/uncompression_dict_reader.cc +++ b/table/block_based/uncompression_dict_reader.cc @@ -60,11 +60,11 @@ Status UncompressionDictReader::ReadUncompressionDictionary( prefetch_buffer, read_options, rep->compression_dict_handle, UncompressionDict::GetEmptyDict(), uncompression_dict, BlockType::kCompressionDictionary, get_context, lookup_context, - /* for_compaction */ false, use_cache); + /* for_compaction */ false, use_cache, /* wait_for_cache */ true); if (!s.ok()) { ROCKS_LOG_WARN( - rep->ioptions.info_log, + rep->ioptions.logger, "Encountered error while reading data from compression dictionary " "block %s", s.ToString().c_str()); diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index b0880d516a2..90558168eeb 100644 --- a/table/block_fetcher.cc +++ b/table/block_fetcher.cc @@ -12,7 +12,6 @@ #include #include -#include "file/file_util.h" #include "logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" @@ -30,9 +29,9 @@ namespace ROCKSDB_NAMESPACE { inline void BlockFetcher::CheckBlockChecksum() { // Check the crc of the type and the block contents if (read_options_.verify_checksums) { - status_ = ROCKSDB_NAMESPACE::VerifyBlockChecksum( + io_status_ = status_to_io_status(ROCKSDB_NAMESPACE::VerifyBlockChecksum( footer_.checksum(), slice_.data(), block_size_, file_->file_name(), - handle_.offset()); + handle_.offset())); } } @@ -46,9 +45,9 @@ inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() { return true; } else { // uncompressed page is not found - if (ioptions_.info_log && !status.IsNotFound()) { + if (ioptions_.logger && !status.IsNotFound()) { assert(!status.ok()); - ROCKS_LOG_INFO(ioptions_.info_log, + ROCKS_LOG_INFO(ioptions_.logger, "Error reading from persistent cache. 
%s", status.ToString().c_str()); } @@ -60,16 +59,19 @@ inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() { inline bool BlockFetcher::TryGetFromPrefetchBuffer() { if (prefetch_buffer_ != nullptr) { IOOptions opts; - Status s = PrepareIOFromReadOptions(read_options_, file_->env(), opts); - if (s.ok() && prefetch_buffer_->TryReadFromCache( - opts, handle_.offset(), block_size_with_trailer_, &slice_, - for_compaction_)) { + IOStatus io_s = file_->PrepareIOOptions(read_options_, opts); + if (io_s.ok() && prefetch_buffer_->TryReadFromCache( + opts, handle_.offset(), block_size_with_trailer_, + &slice_, &io_s, for_compaction_)) { CheckBlockChecksum(); - if (!status_.ok()) { + if (!io_status_.ok()) { return true; } got_from_prefetch_buffer_ = true; used_buf_ = const_cast(slice_.data()); + } else if (!io_s.ok()) { + io_status_ = io_s; + return true; } } return got_from_prefetch_buffer_; @@ -80,18 +82,18 @@ inline bool BlockFetcher::TryGetCompressedBlockFromPersistentCache() { cache_options_.persistent_cache->IsCompressed()) { // lookup uncompressed cache mode p-cache std::unique_ptr raw_data; - status_ = PersistentCacheHelper::LookupRawPage( - cache_options_, handle_, &raw_data, block_size_with_trailer_); - if (status_.ok()) { + io_status_ = status_to_io_status(PersistentCacheHelper::LookupRawPage( + cache_options_, handle_, &raw_data, block_size_with_trailer_)); + if (io_status_.ok()) { heap_buf_ = CacheAllocationPtr(raw_data.release()); used_buf_ = heap_buf_.get(); slice_ = Slice(heap_buf_.get(), block_size_); return true; - } else if (!status_.IsNotFound() && ioptions_.info_log) { - assert(!status_.ok()); - ROCKS_LOG_INFO(ioptions_.info_log, + } else if (!io_status_.IsNotFound() && ioptions_.logger) { + assert(!io_status_.ok()); + ROCKS_LOG_INFO(ioptions_.logger, "Error reading from persistent cache. 
%s", - status_.ToString().c_str()); + io_status_.ToString().c_str()); } } return false; @@ -134,7 +136,7 @@ inline void BlockFetcher::PrepareBufferForBlockFromFile() { } inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() { - if (status_.ok() && read_options_.fill_cache && + if (io_status_.ok() && read_options_.fill_cache && cache_options_.persistent_cache && cache_options_.persistent_cache->IsCompressed()) { // insert to raw cache @@ -144,8 +146,8 @@ inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() { } inline void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() { - if (status_.ok() && !got_from_prefetch_buffer_ && read_options_.fill_cache && - cache_options_.persistent_cache && + if (io_status_.ok() && !got_from_prefetch_buffer_ && + read_options_.fill_cache && cache_options_.persistent_cache && !cache_options_.persistent_cache->IsCompressed()) { // insert to uncompressed cache PersistentCacheHelper::InsertUncompressedPage(cache_options_, handle_, @@ -213,26 +215,26 @@ inline void BlockFetcher::GetBlockContents() { #endif } -Status BlockFetcher::ReadBlockContents() { +IOStatus BlockFetcher::ReadBlockContents() { if (TryGetUncompressBlockFromPersistentCache()) { compression_type_ = kNoCompression; #ifndef NDEBUG contents_->is_raw_block = true; #endif // NDEBUG - return Status::OK(); + return IOStatus::OK(); } if (TryGetFromPrefetchBuffer()) { - if (!status_.ok()) { - return status_; + if (!io_status_.ok()) { + return io_status_; } } else if (!TryGetCompressedBlockFromPersistentCache()) { IOOptions opts; - status_ = PrepareIOFromReadOptions(read_options_, file_->env(), opts); + io_status_ = file_->PrepareIOOptions(read_options_, opts); // Actual file read - if (status_.ok()) { + if (io_status_.ok()) { if (file_->use_direct_io()) { PERF_TIMER_GUARD(block_read_time); - status_ = + io_status_ = file_->Read(opts, handle_.offset(), block_size_with_trailer_, &slice_, nullptr, &direct_io_buf_, for_compaction_); PERF_COUNTER_ADD(block_read_count, 1); @@ -240,8 +242,9 @@ Status BlockFetcher::ReadBlockContents() { } else { PrepareBufferForBlockFromFile(); PERF_TIMER_GUARD(block_read_time); - status_ = file_->Read(opts, handle_.offset(), block_size_with_trailer_, - &slice_, used_buf_, nullptr, for_compaction_); + io_status_ = + file_->Read(opts, handle_.offset(), block_size_with_trailer_, + &slice_, used_buf_, nullptr, for_compaction_); PERF_COUNTER_ADD(block_read_count, 1); #ifndef NDEBUG if (slice_.data() == &stack_buf_[0]) { @@ -275,23 +278,23 @@ Status BlockFetcher::ReadBlockContents() { } PERF_COUNTER_ADD(block_read_byte, block_size_with_trailer_); - if (!status_.ok()) { - return status_; + if (!io_status_.ok()) { + return io_status_; } if (slice_.size() != block_size_with_trailer_) { - return Status::Corruption("truncated block read from " + - file_->file_name() + " offset " + - ToString(handle_.offset()) + ", expected " + - ToString(block_size_with_trailer_) + - " bytes, got " + ToString(slice_.size())); + return IOStatus::Corruption("truncated block read from " + + file_->file_name() + " offset " + + ToString(handle_.offset()) + ", expected " + + ToString(block_size_with_trailer_) + + " bytes, got " + ToString(slice_.size())); } CheckBlockChecksum(); - if (status_.ok()) { + if (io_status_.ok()) { InsertCompressedBlockToPersistentCacheIfNeeded(); } else { - return status_; + return io_status_; } } @@ -302,9 +305,9 @@ Status BlockFetcher::ReadBlockContents() { // compressed page, uncompress, update cache UncompressionContext 
context(compression_type_); UncompressionInfo info(context, uncompression_dict_, compression_type_); - status_ = UncompressBlockContents(info, slice_.data(), block_size_, - contents_, footer_.version(), ioptions_, - memory_allocator_); + io_status_ = status_to_io_status(UncompressBlockContents( + info, slice_.data(), block_size_, contents_, footer_.version(), + ioptions_, memory_allocator_)); #ifndef NDEBUG num_heap_buf_memcpy_++; #endif @@ -315,7 +318,7 @@ Status BlockFetcher::ReadBlockContents() { InsertUncompressedBlockToPersistentCacheIfNeeded(); - return status_; + return io_status_; } } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_fetcher.h b/table/block_fetcher.h index c03352e986f..e06d964b52d 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -39,7 +39,7 @@ class BlockFetcher { BlockFetcher(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& read_options, const BlockHandle& handle, - BlockContents* contents, const ImmutableCFOptions& ioptions, + BlockContents* contents, const ImmutableOptions& ioptions, bool do_uncompress, bool maybe_compressed, BlockType block_type, const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, @@ -64,7 +64,7 @@ class BlockFetcher { memory_allocator_compressed_(memory_allocator_compressed), for_compaction_(for_compaction) {} - Status ReadBlockContents(); + IOStatus ReadBlockContents(); CompressionType get_compression_type() const { return compression_type_; } #ifndef NDEBUG @@ -90,7 +90,7 @@ class BlockFetcher { const ReadOptions read_options_; const BlockHandle& handle_; BlockContents* contents_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const bool do_uncompress_; const bool maybe_compressed_; const BlockType block_type_; @@ -100,7 +100,7 @@ class BlockFetcher { const PersistentCacheOptions& cache_options_; MemoryAllocator* memory_allocator_; MemoryAllocator* memory_allocator_compressed_; - Status status_; + IOStatus io_status_; Slice slice_; char* used_buf_ = nullptr; AlignedBuf direct_io_buf_; diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc index 0786730af64..4499272b4fa 100644 --- a/table/block_fetcher_test.cc +++ b/table/block_fetcher_test.cc @@ -6,11 +6,11 @@ #include "table/block_fetcher.h" #include "db/table_properties_collector.h" -#include "env/composite_env_wrapper.h" #include "file/file_util.h" #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/file_system.h" #include "table/block_based/binary_search_index_reader.h" #include "table/block_based/block_based_table_builder.h" #include "table/block_based/block_based_table_factory.h" @@ -93,22 +93,23 @@ class BlockFetcherTest : public testing::Test { NewFileWriter(table_name, &writer); // Create table builder. 
- ImmutableCFOptions ioptions(options_); + ImmutableOptions ioptions(options_); InternalKeyComparator comparator(options_.comparator); ColumnFamilyOptions cf_options(options_); MutableCFOptions moptions(cf_options); - std::vector> factories; + IntTblPropCollectorFactories factories; std::unique_ptr table_builder(table_factory_.NewTableBuilder( TableBuilderOptions(ioptions, moptions, comparator, &factories, - compression_type, 0 /* sample_for_compression */, - CompressionOptions(), false /* skip_filters */, - kDefaultColumnFamilyName, -1 /* level */), - 0 /* column_family_id */, writer.get())); + compression_type, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, + -1 /* level */), + writer.get())); // Build table. for (int i = 0; i < 9; i++) { std::string key = ToInternalKey(std::to_string(i)); - std::string value = std::to_string(i); + // Append "00000000" to string value to enhance compression ratio + std::string value = "00000000" + std::to_string(i); table_builder->Add(key, value); } ASSERT_OK(table_builder->Finish()); @@ -190,22 +191,30 @@ class BlockFetcherTest : public testing::Test { ASSERT_EQ(memcpy_stats[i].num_compressed_buf_memcpy, expected_stats.memcpy_stats.num_compressed_buf_memcpy); - ASSERT_EQ(heap_buf_allocators[i].GetNumAllocations(), - expected_stats.buf_allocation_stats.num_heap_buf_allocations); - ASSERT_EQ( - compressed_buf_allocators[i].GetNumAllocations(), - expected_stats.buf_allocation_stats.num_compressed_buf_allocations); - - // The allocated buffers are not deallocated until - // the block content is deleted. - ASSERT_EQ(heap_buf_allocators[i].GetNumDeallocations(), 0); - ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), 0); - blocks[i].allocation.reset(); - ASSERT_EQ(heap_buf_allocators[i].GetNumDeallocations(), - expected_stats.buf_allocation_stats.num_heap_buf_allocations); - ASSERT_EQ( - compressed_buf_allocators[i].GetNumDeallocations(), - expected_stats.buf_allocation_stats.num_compressed_buf_allocations); + if (kXpressCompression == compression_type) { + // XPRESS allocates memory internally, thus does not support for + // custom allocator verification + continue; + } else { + ASSERT_EQ( + heap_buf_allocators[i].GetNumAllocations(), + expected_stats.buf_allocation_stats.num_heap_buf_allocations); + ASSERT_EQ(compressed_buf_allocators[i].GetNumAllocations(), + expected_stats.buf_allocation_stats + .num_compressed_buf_allocations); + + // The allocated buffers are not deallocated until + // the block content is deleted. 
+ ASSERT_EQ(heap_buf_allocators[i].GetNumDeallocations(), 0); + ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), 0); + blocks[i].allocation.reset(); + ASSERT_EQ( + heap_buf_allocators[i].GetNumDeallocations(), + expected_stats.buf_allocation_stats.num_heap_buf_allocations); + ASSERT_EQ(compressed_buf_allocators[i].GetNumDeallocations(), + expected_stats.buf_allocation_stats + .num_compressed_buf_allocations); + } } } } @@ -248,11 +257,9 @@ class BlockFetcherTest : public testing::Test { void NewFileWriter(const std::string& filename, std::unique_ptr* writer) { std::string path = Path(filename); - EnvOptions env_options; - std::unique_ptr file; - ASSERT_OK(env_->NewWritableFile(path, &file, env_options)); - writer->reset(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), path, env_options)); + FileOptions file_options; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), path, + file_options, writer, nullptr)); } void NewFileReader(const std::string& filename, const FileOptions& opt, @@ -260,10 +267,11 @@ class BlockFetcherTest : public testing::Test { std::string path = Path(filename); std::unique_ptr f; ASSERT_OK(fs_->NewRandomAccessFile(path, opt, &f, nullptr)); - reader->reset(new RandomAccessFileReader(std::move(f), path, env_)); + reader->reset(new RandomAccessFileReader(std::move(f), path, + env_->GetSystemClock().get())); } - void NewTableReader(const ImmutableCFOptions& ioptions, + void NewTableReader(const ImmutableOptions& ioptions, const FileOptions& foptions, const InternalKeyComparator& comparator, const std::string& table_name, @@ -309,7 +317,7 @@ class BlockFetcherTest : public testing::Test { MemoryAllocator* compressed_buf_allocator, BlockContents* contents, MemcpyStats* stats, CompressionType* compresstion_type) { - ImmutableCFOptions ioptions(options_); + ImmutableOptions ioptions(options_); ReadOptions roptions; PersistentCacheOptions persistent_cache_options; Footer footer; @@ -340,7 +348,7 @@ class BlockFetcherTest : public testing::Test { MemoryAllocator* compressed_buf_allocator, BlockContents* block, std::string* result, MemcpyStats* memcpy_stats) { - ImmutableCFOptions ioptions(options_); + ImmutableOptions ioptions(options_); InternalKeyComparator comparator(options_.comparator); FileOptions foptions(options_); diff --git a/table/cuckoo/cuckoo_table_builder.cc b/table/cuckoo/cuckoo_table_builder.cc index f42e87bdfb1..15f21403567 100644 --- a/table/cuckoo/cuckoo_table_builder.cc +++ b/table/cuckoo/cuckoo_table_builder.cc @@ -82,6 +82,8 @@ CuckooTableBuilder::CuckooTableBuilder( properties_.column_family_name = column_family_name; properties_.db_id = db_id; properties_.db_session_id = db_session_id; + status_.PermitUncheckedError(); + io_status_.PermitUncheckedError(); } void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { @@ -90,8 +92,11 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { return; } ParsedInternalKey ikey; - if (ParseInternalKey(key, &ikey) != Status::OK()) { - status_ = Status::Corruption("Unable to parse key into inernal key."); + Status pik_status = + ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + status_ = Status::Corruption("Unable to parse key into internal key. 
", + pik_status.getState()); return; } if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) { @@ -247,7 +252,6 @@ Status CuckooTableBuilder::Finish() { assert(!closed_); closed_ = true; std::vector buckets; - Status s; std::string unused_bucket; if (num_entries_ > 0) { // Calculate the real hash size if module hash is enabled. diff --git a/table/cuckoo/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc index 322dbf0e4af..a86b6fb18e9 100644 --- a/table/cuckoo/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -5,14 +5,16 @@ #ifndef ROCKSDB_LITE -#include -#include +#include "table/cuckoo/cuckoo_table_builder.h" + #include +#include #include +#include #include "file/random_access_file_reader.h" #include "file/writable_file_writer.h" -#include "table/cuckoo/cuckoo_table_builder.h" +#include "rocksdb/file_system.h" #include "table/meta_blocks.h" #include "test_util/testharness.h" #include "test_util/testutil.h" @@ -35,7 +37,7 @@ class CuckooBuilderTest : public testing::Test { env_ = Env::Default(); Options options; options.allow_mmap_reads = true; - env_options_ = EnvOptions(options); + file_options_ = FileOptions(options); } void CheckFileContents(const std::vector& keys, @@ -47,26 +49,25 @@ class CuckooBuilderTest : public testing::Test { uint64_t num_deletions = 0; for (const auto& key : keys) { ParsedInternalKey parsed; - if (ParseInternalKey(key, &parsed) == Status::OK() && - parsed.type == kTypeDeletion) { + Status pik_status = + ParseInternalKey(key, &parsed, true /* log_err_key */); + if (pik_status.ok() && parsed.type == kTypeDeletion) { num_deletions++; } } // Read file - std::unique_ptr read_file; - ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_)); uint64_t read_file_size; ASSERT_OK(env_->GetFileSize(fname, &read_file_size)); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env_->GetFileSystem(), fname, file_options_, &file_reader, nullptr)); Options options; options.allow_mmap_reads = true; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); // Assert Table Properties. 
TableProperties* props = nullptr; - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, kCuckooTableMagicNumber, ioptions, &props, true /* compression_type_missing */)); @@ -157,7 +158,7 @@ class CuckooBuilderTest : public testing::Test { Env* env_; - EnvOptions env_options_; + FileOptions file_options_; std::string fname; const double kHashTableRatio = 0.9; }; @@ -165,10 +166,9 @@ class CuckooBuilderTest : public testing::Test { TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { std::unique_ptr writable_file; fname = test::PerThreadDBPath("EmptyFile"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -206,12 +206,10 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; fname = test::PerThreadDBPath("NoCollisionFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -256,12 +254,10 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; fname = test::PerThreadDBPath("WithCollisionFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -305,13 +301,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; uint32_t cuckoo_block_size = 2; fname = test::PerThreadDBPath("WithCollisionFullKey2"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder( file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash, @@ -360,12 +354,10 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { } 
uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathFullKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -411,12 +403,10 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash, 0 /* column_family_id */, @@ -455,12 +445,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { std::vector expected_locations = {0, 1, 2, 3}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("NoCollisionUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -500,12 +489,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { std::vector expected_locations = {0, 1, 2, 3}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -547,12 +535,11 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { std::vector expected_locations = {0, 1, 3, 4, 2}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - 
NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -593,12 +580,10 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { }; hash_map = std::move(hm); - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("WithCollisionPathUserKey"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -622,12 +607,10 @@ TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { uint32_t num_hash_fun = 4; std::string user_key = "repeatedkey"; - std::unique_ptr writable_file; + std::unique_ptr file_writer; fname = test::PerThreadDBPath("FailWhenSameKeyInserted"); - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - EnvOptions())); + ASSERT_OK(WritableFileWriter::Create(env_->GetFileSystem(), fname, + file_options_, &file_writer, nullptr)); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, diff --git a/table/cuckoo/cuckoo_table_factory.cc b/table/cuckoo/cuckoo_table_factory.cc index c6d3c377ce5..4fd014e97f6 100644 --- a/table/cuckoo/cuckoo_table_factory.cc +++ b/table/cuckoo/cuckoo_table_factory.cc @@ -30,11 +30,8 @@ Status CuckooTableFactory::NewTableReader( } TableBuilder* CuckooTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { - // Ignore the skipFIlters flag. 
Does not apply to this file format - // - // TODO: change builder to take the option struct return new CuckooTableBuilder( file, table_options_.hash_table_ratio, 64, @@ -42,8 +39,9 @@ TableBuilder* CuckooTableFactory::NewTableBuilder( table_builder_options.internal_comparator.user_comparator(), table_options_.cuckoo_block_size, table_options_.use_module_hash, table_options_.identity_as_first_hash, nullptr /* get_slice_hash */, - column_family_id, table_builder_options.column_family_name, - table_builder_options.db_id, table_builder_options.db_session_id); + table_builder_options.column_family_id, + table_builder_options.column_family_name, table_builder_options.db_id, + table_builder_options.db_session_id); } std::string CuckooTableFactory::GetPrintableOptions() const { @@ -95,8 +93,7 @@ static std::unordered_map cuckoo_table_type_info = CuckooTableFactory::CuckooTableFactory(const CuckooTableOptions& table_options) : table_options_(table_options) { - ConfigurableHelper::RegisterOptions(*this, &table_options_, - &cuckoo_table_type_info); + RegisterOptions(&table_options_, &cuckoo_table_type_info); } TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) { diff --git a/table/cuckoo/cuckoo_table_factory.h b/table/cuckoo/cuckoo_table_factory.h index 30d4155e1fa..a51f23e532a 100644 --- a/table/cuckoo/cuckoo_table_factory.h +++ b/table/cuckoo/cuckoo_table_factory.h @@ -56,6 +56,8 @@ class CuckooTableFactory : public TableFactory { const CuckooTableOptions& table_option = CuckooTableOptions()); ~CuckooTableFactory() {} + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kCuckooTableName(); } const char* Name() const override { return kCuckooTableName(); } using TableFactory::NewTableReader; @@ -67,7 +69,7 @@ class CuckooTableFactory : public TableFactory { TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; std::string GetPrintableOptions() const override; diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index 275649ea838..4045d45287d 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -15,7 +15,9 @@ #include #include #include + #include "memory/arena.h" +#include "options/cf_options.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" #include "table/cuckoo/cuckoo_table_factory.h" @@ -33,7 +35,7 @@ const uint32_t kInvalidIndex = std::numeric_limits::max(); extern const uint64_t kCuckooTableMagicNumber; CuckooTableReader::CuckooTableReader( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) @@ -172,7 +174,8 @@ Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, } else { Slice full_key(bucket, key_length_); ParsedInternalKey found_ikey; - Status s = ParseInternalKey(full_key, &found_ikey); + Status s = ParseInternalKey(full_key, &found_ikey, + false /* log_err_key */); // TODO if (!s.ok()) return s; bool dont_care __attribute__((__unused__)); get_context->SaveValue(found_ikey, value, &dont_care); diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h index 5a7c8b72db4..43afd4fd7e8 100644 --- a/table/cuckoo/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -16,7 +16,6 @@ #include 
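Because the column family id now travels inside TableBuilderOptions, a table factory override takes only the options struct and the file writer. A hedged sketch of the new shape (MyTableFactory/MyTableBuilder are hypothetical types; the member accesses mirror those used above):

TableBuilder* MyTableFactory::NewTableBuilder(
    const TableBuilderOptions& table_builder_options,
    WritableFileWriter* file) const {
  // column_family_id is no longer a separate parameter.
  return new MyTableBuilder(file, table_builder_options.column_family_id,
                            table_builder_options.column_family_name,
                            table_builder_options.db_id,
                            table_builder_options.db_session_id);
}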
"db/dbformat.h" #include "file/random_access_file_reader.h" -#include "options/cf_options.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "table/table_reader.h" @@ -25,10 +24,11 @@ namespace ROCKSDB_NAMESPACE { class Arena; class TableReader; +struct ImmutableOptions; class CuckooTableReader: public TableReader { public: - CuckooTableReader(const ImmutableCFOptions& ioptions, + CuckooTableReader(const ImmutableOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* user_comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index 5795de80d38..1914d26f13b 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -31,7 +31,6 @@ int main() { #include "util/string_util.h" using GFLAGS_NAMESPACE::ParseCommandLineFlags; -using GFLAGS_NAMESPACE::SetUsageMessage; DEFINE_string(file_dir, "", "Directory where the files will be created" " for benchmark. Added for using tmpfs."); @@ -69,7 +68,7 @@ class CuckooReaderTest : public testing::Test { CuckooReaderTest() { options.allow_mmap_reads = true; env = options.env; - env_options = EnvOptions(options); + file_options = FileOptions(options); } void SetUp(int num) { @@ -89,12 +88,9 @@ class CuckooReaderTest : public testing::Test { void CreateCuckooFileAndCheckReader( const Comparator* ucomp = BytewiseComparator()) { - std::unique_ptr writable_file; - ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - env_options)); - + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), fname, + file_options, &file_writer, nullptr)); CuckooTableBuilder builder( file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false, GetSliceHash, 0 /* column_family_id */, kDefaultColumnFamilyName); @@ -110,12 +106,10 @@ class CuckooReaderTest : public testing::Test { ASSERT_OK(file_writer->Close()); // Check reader now. 
- std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - const ImmutableCFOptions ioptions(options); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); @@ -140,12 +134,10 @@ class CuckooReaderTest : public testing::Test { } void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - const ImmutableCFOptions ioptions(options); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); @@ -212,12 +204,12 @@ class CuckooReaderTest : public testing::Test { uint64_t file_size; Options options; Env* env; - EnvOptions env_options; + FileOptions file_options; }; TEST_F(CuckooReaderTest, FileNotMmaped) { options.allow_mmap_reads = false; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, nullptr, 0, nullptr, nullptr); ASSERT_TRUE(reader.status().IsInvalidArgument()); ASSERT_STREQ("File is not mmaped", reader.status().getState()); @@ -331,12 +323,12 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { } auto* ucmp = BytewiseComparator(); CreateCuckooFileAndCheckReader(); - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - const ImmutableCFOptions ioptions(options); + + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create( + env->GetFileSystem(), fname, file_options, &file_reader, nullptr)); + + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp, GetSliceHash); ASSERT_OK(reader.status()); @@ -416,15 +408,13 @@ void WriteFile(const std::vector& keys, const uint64_t num, double hash_ratio) { Options options; options.allow_mmap_reads = true; - Env* env = options.env; - EnvOptions env_options = EnvOptions(options); + const auto& fs = options.env->GetFileSystem(); + FileOptions file_options(options); std::string fname = GetFileName(num); - std::unique_ptr writable_file; - ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - std::unique_ptr file_writer(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(writable_file)), fname, - env_options)); + std::unique_ptr file_writer; + ASSERT_OK(WritableFileWriter::Create(fs, fname, file_options, &file_writer, + nullptr)); CuckooTableBuilder builder( file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5, false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */, @@ -441,14 +431,13 @@ void WriteFile(const std::vector& keys, ASSERT_OK(file_writer->Close()); uint64_t file_size; - env->GetFileSize(fname, &file_size); - 
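The read side mirrors the writer change: a RandomAccessFileReader is obtained from the FileSystem factory and then driven with IOOptions. A short sketch (file name and read size are illustrative):

std::unique_ptr<RandomAccessFileReader> file_reader;
ASSERT_OK(RandomAccessFileReader::Create(env->GetFileSystem(), "/tmp/example_table",
                                         FileOptions(), &file_reader, nullptr));
char scratch[16];
Slice result;
ASSERT_OK(file_reader->Read(IOOptions(), 0 /* offset */, sizeof(scratch),
                            &result, scratch, nullptr));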
std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - - const ImmutableCFOptions ioptions(options); + ASSERT_OK( + fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr)); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options, + &file_reader, nullptr)); + + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); @@ -470,18 +459,18 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { Options options; options.allow_mmap_reads = true; Env* env = options.env; - EnvOptions env_options = EnvOptions(options); + const auto& fs = options.env->GetFileSystem(); + FileOptions file_options(options); std::string fname = GetFileName(num); uint64_t file_size; - env->GetFileSize(fname, &file_size); - std::unique_ptr read_file; - ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(read_file), - fname)); - - const ImmutableCFOptions ioptions(options); + ASSERT_OK( + fs->GetFileSize(fname, file_options.io_options, &file_size, nullptr)); + std::unique_ptr file_reader; + ASSERT_OK(RandomAccessFileReader::Create(fs, fname, file_options, + &file_reader, nullptr)); + + const ImmutableOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); diff --git a/table/format.cc b/table/format.cc index 23dc0bbc18c..5e0307b599e 100644 --- a/table/format.cc +++ b/table/format.cc @@ -14,11 +14,11 @@ #include "block_fetcher.h" #include "file/random_access_file_reader.h" -#include "logging/logging.h" #include "memory/memory_allocator.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" #include "table/persistent_cache_helper.h" @@ -41,6 +41,7 @@ extern const uint64_t kPlainTableMagicNumber; const uint64_t kLegacyPlainTableMagicNumber = 0; const uint64_t kPlainTableMagicNumber = 0; #endif +const char* kHostnameForDbHostId = "__hostname__"; bool ShouldReportDetailedTime(Env* env, Statistics* stats) { return env != nullptr && stats != nullptr && @@ -306,8 +307,9 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, // for iterator, TryReadFromCache might do a readahead. 
Revisit to see if we // need to pass a timeout at that point if (prefetch_buffer == nullptr || - !prefetch_buffer->TryReadFromCache( - IOOptions(), read_offset, Footer::kMaxEncodedLength, &footer_input)) { + !prefetch_buffer->TryReadFromCache(IOOptions(), read_offset, + Footer::kMaxEncodedLength, + &footer_input, nullptr)) { if (file->use_direct_io()) { s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, &footer_input, nullptr, &internal_buf); @@ -345,14 +347,14 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, Status UncompressBlockContentsForCompressionType( const UncompressionInfo& uncompression_info, const char* data, size_t n, BlockContents* contents, uint32_t format_version, - const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) { + const ImmutableOptions& ioptions, MemoryAllocator* allocator) { Status ret = Status::OK(); assert(uncompression_info.type() != kNoCompression && "Invalid compression type"); - StopWatchNano timer(ioptions.env, ShouldReportDetailedTime( - ioptions.env, ioptions.statistics)); + StopWatchNano timer(ioptions.clock, + ShouldReportDetailedTime(ioptions.env, ioptions.stats)); size_t uncompressed_size = 0; CacheAllocationPtr ubuf = UncompressData(uncompression_info, data, n, &uncompressed_size, @@ -365,13 +367,13 @@ Status UncompressBlockContentsForCompressionType( *contents = BlockContents(std::move(ubuf), uncompressed_size); - if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) { - RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS, + if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) { + RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS, timer.ElapsedNanos()); } - RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED, + RecordTimeToHistogram(ioptions.stats, BYTES_DECOMPRESSED, contents->data.size()); - RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED); + RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED); TEST_SYNC_POINT_CALLBACK( "UncompressBlockContentsForCompressionType:TamperWithReturnValue", @@ -394,7 +396,7 @@ Status UncompressBlockContentsForCompressionType( Status UncompressBlockContents(const UncompressionInfo& uncompression_info, const char* data, size_t n, BlockContents* contents, uint32_t format_version, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, MemoryAllocator* allocator) { assert(data[n] != kNoCompression); assert(data[n] == static_cast(uncompression_info.type())); @@ -403,4 +405,18 @@ Status UncompressBlockContents(const UncompressionInfo& uncompression_info, ioptions, allocator); } +// Replace the contents of db_host_id with the actual hostname, if db_host_id +// matches the keyword kHostnameForDbHostId +Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) { + assert(db_host_id); + if (*db_host_id == kHostnameForDbHostId) { + Status s = env->GetHostNameString(db_host_id); + if (!s.ok()) { + db_host_id->clear(); + } + return s; + } + + return Status::OK(); +} } // namespace ROCKSDB_NAMESPACE diff --git a/table/format.h b/table/format.h index e40a5ceaeac..7d7962a4d54 100644 --- a/table/format.h +++ b/table/format.h @@ -305,7 +305,7 @@ struct BlockContents { extern Status ReadBlockContents( RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - BlockContents* contents, const ImmutableCFOptions& ioptions, + BlockContents* contents, const ImmutableOptions& ioptions, bool do_uncompress = true, 
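ReifyDbHostIdProperty only rewrites the property when it equals the kHostnameForDbHostId sentinel; otherwise it is a no-op. A small usage sketch, mirroring how the PlainTableBuilder hunk later in this patch treats the result:

std::string db_host_id = kHostnameForDbHostId;  // the "__hostname__" sentinel
Status s = ReifyDbHostIdProperty(Env::Default(), &db_host_id);
if (s.ok()) {
  // db_host_id now holds the real hostname (or an unchanged literal value).
} else {
  // On failure the helper clears the string; callers above just log and continue.
}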
const Slice& compression_dict = Slice(), const PersistentCacheOptions& cache_options = PersistentCacheOptions()); @@ -320,7 +320,7 @@ extern Status UncompressBlockContents(const UncompressionInfo& info, const char* data, size_t n, BlockContents* contents, uint32_t compress_format_version, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr); // This is an extension to UncompressBlockContents that accepts @@ -329,7 +329,10 @@ extern Status UncompressBlockContents(const UncompressionInfo& info, extern Status UncompressBlockContentsForCompressionType( const UncompressionInfo& info, const char* data, size_t n, BlockContents* contents, uint32_t compress_format_version, - const ImmutableCFOptions& ioptions, MemoryAllocator* allocator = nullptr); + const ImmutableOptions& ioptions, MemoryAllocator* allocator = nullptr); + +// Replace db_host_id contents with the real hostname if necessary +extern Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id); // Implementation details follow. Clients should ignore, diff --git a/table/get_context.cc b/table/get_context.cc index ecd59220a73..919ed4c3429 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -4,15 +4,16 @@ // (found in the LICENSE.Apache file in the root directory). #include "table/get_context.h" + #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/read_callback.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" -#include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/statistics.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { @@ -38,14 +39,17 @@ void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { } // namespace -GetContext::GetContext( - const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, - Statistics* statistics, GetState init_state, const Slice& user_key, - PinnableSlice* pinnable_val, std::string* timestamp, bool* value_found, - MergeContext* merge_context, bool do_merge, - SequenceNumber* _max_covering_tombstone_seq, Env* env, SequenceNumber* seq, - PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, - bool* is_blob_index, uint64_t tracing_get_id) +GetContext::GetContext(const Comparator* ucmp, + const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, + const Slice& user_key, PinnableSlice* pinnable_val, + std::string* timestamp, bool* value_found, + MergeContext* merge_context, bool do_merge, + SequenceNumber* _max_covering_tombstone_seq, + SystemClock* clock, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, + ReadCallback* callback, bool* is_blob_index, + uint64_t tracing_get_id, BlobFetcher* blob_fetcher) : ucmp_(ucmp), merge_operator_(merge_operator), logger_(logger), @@ -57,14 +61,15 @@ GetContext::GetContext( value_found_(value_found), merge_context_(merge_context), max_covering_tombstone_seq_(_max_covering_tombstone_seq), - env_(env), + clock_(clock), seq_(seq), replay_log_(nullptr), pinned_iters_mgr_(_pinned_iters_mgr), callback_(callback), do_merge_(do_merge), is_blob_index_(is_blob_index), - tracing_get_id_(tracing_get_id) { + tracing_get_id_(tracing_get_id), + blob_fetcher_(blob_fetcher) { if (seq_) { *seq_ = kMaxSequenceNumber; } @@ -75,13 +80,14 @@ GetContext::GetContext( const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState 
init_state, const Slice& user_key, PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, - bool do_merge, SequenceNumber* _max_covering_tombstone_seq, Env* env, - SequenceNumber* seq, PinnedIteratorsManager* _pinned_iters_mgr, - ReadCallback* callback, bool* is_blob_index, uint64_t tracing_get_id) + bool do_merge, SequenceNumber* _max_covering_tombstone_seq, + SystemClock* clock, SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, ReadCallback* callback, + bool* is_blob_index, uint64_t tracing_get_id, BlobFetcher* blob_fetcher) : GetContext(ucmp, merge_operator, logger, statistics, init_state, user_key, pinnable_val, nullptr, value_found, merge_context, do_merge, - _max_covering_tombstone_seq, env, seq, _pinned_iters_mgr, - callback, is_blob_index, tracing_get_id) {} + _max_covering_tombstone_seq, clock, seq, _pinned_iters_mgr, + callback, is_blob_index, tracing_get_id, blob_fetcher) {} // Called from TableCache::Get and Table::Get when file/block in which // key may exist are not there in TableCache/BlockCache respectively. In this @@ -216,7 +222,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, assert(matched); assert((state_ != kMerge && parsed_key.type != kTypeMerge) || merge_context_ != nullptr); - if (ucmp_->CompareWithoutTimestamp(parsed_key.user_key, user_key_) == 0) { + if (ucmp_->EqualWithoutTimestamp(parsed_key.user_key, user_key_)) { *matched = true; // If the value is not in the snapshot, skip it if (!CheckCallback(parsed_key.sequence)) { @@ -245,9 +251,12 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, assert(state_ == kNotFound || state_ == kMerge); if (type == kTypeBlobIndex && is_blob_index_ == nullptr) { // Blob value not supported. Stop. - state_ = kBlobIndex; + state_ = kUnexpectedBlobIndex; return false; } + if (is_blob_index_ != nullptr) { + *is_blob_index_ = (type == kTypeBlobIndex); + } if (kNotFound == state_) { state_ = kFound; if (do_merge_) { @@ -258,7 +267,6 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } else { TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", this); - // Otherwise copy the value pinnable_val_->PinSelf(value); } @@ -267,27 +275,44 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, // It means this function is called as part of DB GetMergeOperands // API and the current value should be part of // merge_context_->operand_list - push_operand(value, value_pinner); + if (is_blob_index_ != nullptr && *is_blob_index_) { + PinnableSlice pin_val; + if (GetBlobValue(value, &pin_val) == false) { + return false; + } + Slice blob_value(pin_val); + push_operand(blob_value, nullptr); + } else { + push_operand(value, value_pinner); + } } } else if (kMerge == state_) { assert(merge_operator_ != nullptr); - state_ = kFound; - if (do_merge_) { - if (LIKELY(pinnable_val_ != nullptr)) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, &value, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; - } + if (is_blob_index_ != nullptr && *is_blob_index_) { + PinnableSlice pin_val; + if (GetBlobValue(value, &pin_val) == false) { + return false; + } + Slice blob_value(pin_val); + state_ = kFound; + if (do_merge_) { + Merge(&blob_value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + 
push_operand(blob_value, nullptr); } } else { - // It means this function is called as part of DB GetMergeOperands - // API and the current value should be part of - // merge_context_->operand_list - push_operand(value, value_pinner); + state_ = kFound; + if (do_merge_) { + Merge(&value); + } else { + // It means this function is called as part of DB GetMergeOperands + // API and the current value should be part of + // merge_context_->operand_list + push_operand(value, value_pinner); + } } } if (state_ == kFound) { @@ -297,9 +322,6 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, timestamp_->assign(ts.data(), ts.size()); } } - if (is_blob_index_ != nullptr) { - *is_blob_index_ = (type == kTypeBlobIndex); - } return false; case kTypeDeletion: @@ -313,20 +335,9 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, state_ = kDeleted; } else if (kMerge == state_) { state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - if (do_merge_) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; - } - } - // If do_merge_ = false then the current value shouldn't be part of - // merge_context_->operand_list - } + Merge(nullptr); + // If do_merge_ = false then the current value shouldn't be part of + // merge_context_->operand_list } return false; @@ -339,20 +350,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, merge_operator_->ShouldMerge( merge_context_->GetOperandsDirectionBackward())) { state_ = kFound; - if (LIKELY(pinnable_val_ != nullptr)) { - // do_merge_ = true this is the case where this function is called - // as part of DB Get API hence merge operators should be merged. 
- if (do_merge_) { - Status merge_status = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, nullptr, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), - logger_, statistics_, env_); - pinnable_val_->PinSelf(); - if (!merge_status.ok()) { - state_ = kCorrupt; - } - } - } + Merge(nullptr); return false; } return true; @@ -367,6 +365,35 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, return false; } +void GetContext::Merge(const Slice* value) { + if (LIKELY(pinnable_val_ != nullptr)) { + if (do_merge_) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, value, merge_context_->GetOperands(), + pinnable_val_->GetSelf(), logger_, statistics_, clock_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } + } + } +} + +bool GetContext::GetBlobValue(const Slice& blob_index, + PinnableSlice* blob_value) { + Status status = blob_fetcher_->FetchBlob(user_key_, blob_index, blob_value); + if (!status.ok()) { + if (status.IsIncomplete()) { + MarkKeyMayExist(); + return false; + } + state_ = kCorrupt; + return false; + } + *is_blob_index_ = false; + return true; +} + void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) { if (pinned_iters_mgr() && pinned_iters_mgr()->PinningEnabled() && value_pinner != nullptr) { diff --git a/table/get_context.h b/table/get_context.h index c349a3e6ff9..9b2f678078d 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -5,10 +5,11 @@ #pragma once #include + +#include "db/blob/blob_fetcher.h" #include "db/dbformat.h" #include "db/merge_context.h" #include "db/read_callback.h" -#include "rocksdb/env.h" #include "rocksdb/statistics.h" #include "rocksdb/types.h" #include "table/block_based/block.h" @@ -16,6 +17,7 @@ namespace ROCKSDB_NAMESPACE { class MergeContext; class PinnedIteratorsManager; +class SystemClock; // Data structure for accumulating statistics during a point lookup. At the // end of the point lookup, the corresponding ticker stats are updated. This @@ -71,7 +73,7 @@ class GetContext { kDeleted, kCorrupt, kMerge, // saver contains the current merge result (the operands) - kBlobIndex, + kUnexpectedBlobIndex, }; GetContextStats get_context_stats_; @@ -97,23 +99,23 @@ class GetContext { // merge_context and they are never merged. The value pointer is untouched. 
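Taken together, the SaveValue hunks above reduce to one pattern: blob indexes are resolved through the BlobFetcher first, and every full merge funnels through the new Merge() helper (nullptr meaning "no base value"). A condensed view of that flow, simplified from the hunks above:

// Inside SaveValue(), for a value/blob-index entry while in the merge state:
if (is_blob_index_ != nullptr && *is_blob_index_) {
  PinnableSlice pin_val;
  if (!GetBlobValue(value, &pin_val)) {
    return false;  // GetBlobValue already set kCorrupt or MarkKeyMayExist()
  }
  Slice blob_value(pin_val);
  state_ = kFound;
  if (do_merge_) {
    Merge(&blob_value);                 // full merge with the fetched blob
  } else {
    push_operand(blob_value, nullptr);  // GetMergeOperands() path
  }
} else {
  state_ = kFound;
  if (do_merge_) {
    Merge(&value);
  } else {
    push_operand(value, value_pinner);
  }
}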
GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, - const Slice& user_key, PinnableSlice* value, - bool* value_found, MergeContext* merge_context, bool do_merge, - SequenceNumber* max_covering_tombstone_seq, Env* env, + const Slice& user_key, PinnableSlice* value, bool* value_found, + MergeContext* merge_context, bool do_merge, + SequenceNumber* max_covering_tombstone_seq, SystemClock* clock, SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, - uint64_t tracing_get_id = 0); + uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr); GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* value, std::string* timestamp, bool* value_found, MergeContext* merge_context, bool do_merge, - SequenceNumber* max_covering_tombstone_seq, Env* env, + SequenceNumber* max_covering_tombstone_seq, SystemClock* clock, SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr, - uint64_t tracing_get_id = 0); + uint64_t tracing_get_id = 0, BlobFetcher* blob_fetcher = nullptr); GetContext() = delete; @@ -170,6 +172,9 @@ class GetContext { void push_operand(const Slice& value, Cleanable* value_pinner); private: + void Merge(const Slice* value); + bool GetBlobValue(const Slice& blob_index, PinnableSlice* blob_value); + const Comparator* ucmp_; const MergeOperator* merge_operator_; // the merge operations encountered; @@ -183,7 +188,7 @@ class GetContext { bool* value_found_; // Is value set correctly? Used by KeyMayExist MergeContext* merge_context_; SequenceNumber* max_covering_tombstone_seq_; - Env* env_; + SystemClock* clock_; // If a key is found, seq_ will be set to the SequenceNumber of most recent // write to the key or kMaxSequenceNumber if unknown SequenceNumber* seq_; @@ -200,6 +205,7 @@ class GetContext { // Used for block cache tracing only. A tracing get id uniquely identifies a // Get or a MultiGet. const uint64_t tracing_get_id_; + BlobFetcher* blob_fetcher_; }; // Call this to replay a log and bring the get_context up to date. 
The replay diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 6b781de1ea9..52e56be81bf 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -83,6 +83,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, props.index_value_is_delta_encoded); Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kNumFilterEntries, props.num_filter_entries); Add(TablePropertiesNames::kDeletedKeys, props.num_deletions); Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands); Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); @@ -96,12 +97,23 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { if (props.file_creation_time > 0) { Add(TablePropertiesNames::kFileCreationTime, props.file_creation_time); } + if (props.slow_compression_estimated_data_size > 0) { + Add(TablePropertiesNames::kSlowCompressionEstimatedDataSize, + props.slow_compression_estimated_data_size); + } + if (props.fast_compression_estimated_data_size > 0) { + Add(TablePropertiesNames::kFastCompressionEstimatedDataSize, + props.fast_compression_estimated_data_size); + } if (!props.db_id.empty()) { Add(TablePropertiesNames::kDbId, props.db_id); } if (!props.db_session_id.empty()) { Add(TablePropertiesNames::kDbSessionId, props.db_session_id); } + if (!props.db_host_id.empty()) { + Add(TablePropertiesNames::kDbHostId, props.db_host_id); + } if (!props.filter_policy_name.empty()) { Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name); @@ -141,8 +153,8 @@ Slice PropertyBlockBuilder::Finish() { return properties_block_->Finish(); } -void LogPropertiesCollectionError( - Logger* info_log, const std::string& method, const std::string& name) { +void LogPropertiesCollectionError(Logger* info_log, const std::string& method, + const std::string& name) { assert(method == "Add" || method == "Finish"); std::string msg = @@ -169,11 +181,11 @@ bool NotifyCollectTableCollectorsOnAdd( void NotifyCollectTableCollectorsOnBlockAdd( const std::vector>& collectors, - const uint64_t blockRawBytes, const uint64_t blockCompressedBytesFast, - const uint64_t blockCompressedBytesSlow) { + const uint64_t block_raw_bytes, const uint64_t block_compressed_bytes_fast, + const uint64_t block_compressed_bytes_slow) { for (auto& collector : collectors) { - collector->BlockAdd(blockRawBytes, blockCompressedBytesFast, - blockCompressedBytesSlow); + collector->BlockAdd(block_raw_bytes, block_compressed_bytes_fast, + block_compressed_bytes_slow); } } @@ -200,7 +212,7 @@ bool NotifyCollectTableCollectorsOnFinish( Status ReadProperties(const ReadOptions& read_options, const Slice& handle_value, RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, TableProperties** table_properties, bool verify_checksum, BlockHandle* ret_block_handle, CacheAllocationPtr* verification_buf, @@ -258,6 +270,8 @@ Status ReadProperties(const ReadOptions& read_options, {TablePropertiesNames::kNumDataBlocks, &new_table_properties->num_data_blocks}, {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kNumFilterEntries, + &new_table_properties->num_filter_entries}, {TablePropertiesNames::kDeletedKeys, &new_table_properties->num_deletions}, {TablePropertiesNames::kMergeOperands, @@ -276,6 +290,10 @@ Status ReadProperties(const ReadOptions& read_options, 
&new_table_properties->oldest_key_time}, {TablePropertiesNames::kFileCreationTime, &new_table_properties->file_creation_time}, + {TablePropertiesNames::kSlowCompressionEstimatedDataSize, + &new_table_properties->slow_compression_estimated_data_size}, + {TablePropertiesNames::kFastCompressionEstimatedDataSize, + &new_table_properties->fast_compression_estimated_data_size}, }; std::string last_key; @@ -314,7 +332,7 @@ Status ReadProperties(const ReadOptions& read_options, auto error_msg = "Detect malformed value in properties meta-block:" "\tkey: " + key + "\tval: " + raw_val.ToString(); - ROCKS_LOG_ERROR(ioptions.info_log, "%s", error_msg.c_str()); + ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str()); continue; } *(pos->second) = val; @@ -322,6 +340,8 @@ Status ReadProperties(const ReadOptions& read_options, new_table_properties->db_id = raw_val.ToString(); } else if (key == TablePropertiesNames::kDbSessionId) { new_table_properties->db_session_id = raw_val.ToString(); + } else if (key == TablePropertiesNames::kDbHostId) { + new_table_properties->db_host_id = raw_val.ToString(); } else if (key == TablePropertiesNames::kFilterPolicy) { new_table_properties->filter_policy_name = raw_val.ToString(); } else if (key == TablePropertiesNames::kColumnFamilyName) { @@ -366,7 +386,7 @@ Status ReadProperties(const ReadOptions& read_options, Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, TableProperties** properties, bool compression_type_missing, MemoryAllocator* memory_allocator, @@ -437,7 +457,7 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const std::string& meta_block_name, BlockHandle* block_handle, bool /*compression_type_missing*/, @@ -479,7 +499,7 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const std::string& meta_block_name, BlockType block_type, BlockContents* contents, bool /*compression_type_missing*/, MemoryAllocator* memory_allocator) { diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 319b2c7127e..01b56d57c0c 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -70,8 +70,8 @@ class PropertyBlockBuilder { // Were we encounter any error occurs during user-defined statistics collection, // we'll write the warning message to info log. -void LogPropertiesCollectionError( - Logger* info_log, const std::string& method, const std::string& name); +void LogPropertiesCollectionError(Logger* info_log, const std::string& method, + const std::string& name); // Utility functions help table builder to trigger batch events for user // defined property collectors. @@ -86,8 +86,8 @@ bool NotifyCollectTableCollectorsOnAdd( void NotifyCollectTableCollectorsOnBlockAdd( const std::vector>& collectors, - uint64_t blockRawBytes, uint64_t blockCompressedBytesFast, - uint64_t blockCompressedBytesSlow); + uint64_t block_raw_bytes, uint64_t block_compressed_bytes_fast, + uint64_t block_compressed_bytes_slow); // NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all // property collectors. 
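These property-block additions round-trip: PropertyBlockBuilder::AddTableProperty writes them only when non-zero or non-empty, and ReadProperties fills the matching TableProperties fields. A reader-side sketch using the fields introduced above (props is a TableProperties* filled in by ReadTableProperties; zero or empty means the writer did not record the value):

uint64_t filter_entries = props->num_filter_entries;
uint64_t slow_estimate  = props->slow_compression_estimated_data_size;
uint64_t fast_estimate  = props->fast_compression_estimated_data_size;
const std::string& host = props->db_host_id;  // possibly the reified hostname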
The collected properties will be added to `builder`. @@ -102,7 +102,7 @@ bool NotifyCollectTableCollectorsOnFinish( Status ReadProperties(const ReadOptions& ro, const Slice& handle_value, RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, TableProperties** table_properties, bool verify_checksum, BlockHandle* block_handle, CacheAllocationPtr* verification_buf, @@ -119,7 +119,7 @@ Status ReadProperties(const ReadOptions& ro, const Slice& handle_value, // `ReadProperties`, `FindMetaBlock`, and `ReadMetaBlock` Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, TableProperties** properties, bool compression_type_missing = false, MemoryAllocator* memory_allocator = nullptr, @@ -133,7 +133,7 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, // Find the meta block Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const std::string& meta_block_name, BlockHandle* block_handle, bool compression_type_missing = false, @@ -145,7 +145,7 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const std::string& meta_block_name, BlockType block_type, BlockContents* contents, bool compression_type_missing = false, diff --git a/table/mock_table.cc b/table/mock_table.cc index 757fdb963a5..cc3cff97331 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -18,8 +18,8 @@ namespace mock { KVVector MakeMockFile(std::initializer_list l) { return KVVector(l); } -void SortKVVector(KVVector* kv_vector) { - InternalKeyComparator icmp(BytewiseComparator()); +void SortKVVector(KVVector* kv_vector, const Comparator* ucmp) { + InternalKeyComparator icmp(ucmp); std::sort(kv_vector->begin(), kv_vector->end(), [icmp](KVPair a, KVPair b) -> bool { return icmp.Compare(a.first, b.first) < 0; @@ -207,8 +207,10 @@ Status MockTableReader::Get(const ReadOptions&, const Slice& key, std::unique_ptr iter(new MockTableIterator(table_)); for (iter->Seek(key); iter->Valid(); iter->Next()) { ParsedInternalKey parsed_key; - if (ParseInternalKey(iter->key(), &parsed_key) != Status::OK()) { - return Status::Corruption(Slice()); + Status pik_status = + ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */); + if (!pik_status.ok()) { + return pik_status; } bool dont_care __attribute__((__unused__)); @@ -233,7 +235,11 @@ Status MockTableFactory::NewTableReader( std::unique_ptr&& file, uint64_t /*file_size*/, std::unique_ptr* table_reader, bool /*prefetch_index_and_filter_in_cache*/) const { - uint32_t id = GetIDFromFile(file.get()); + uint32_t id; + Status s = GetIDFromFile(file.get(), &id); + if (!s.ok()) { + return s; + } MutexLock lock_guard(&file_system_.mutex); @@ -249,42 +255,46 @@ Status MockTableFactory::NewTableReader( TableBuilder* MockTableFactory::NewTableBuilder( const TableBuilderOptions& /*table_builder_options*/, - uint32_t /*column_family_id*/, WritableFileWriter* file) const { - uint32_t id = GetAndWriteNextID(file); + WritableFileWriter* file) const { + uint32_t id; + Status s = GetAndWriteNextID(file, &id); + 
assert(s.ok()); return new MockTableBuilder(id, &file_system_, corrupt_mode_); } Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, KVVector file_contents) { - std::unique_ptr file; - auto s = env->NewWritableFile(fname, &file, EnvOptions()); + std::unique_ptr file_writer; + auto s = WritableFileWriter::Create(env->GetFileSystem(), fname, + FileOptions(), &file_writer, nullptr); if (!s.ok()) { return s; } - - WritableFileWriter file_writer(NewLegacyWritableFileWrapper(std::move(file)), - fname, EnvOptions()); - - uint32_t id = GetAndWriteNextID(&file_writer); - file_system_.files.insert({id, std::move(file_contents)}); - return Status::OK(); + uint32_t id; + s = GetAndWriteNextID(file_writer.get(), &id); + if (s.ok()) { + file_system_.files.insert({id, std::move(file_contents)}); + } + return s; } -uint32_t MockTableFactory::GetAndWriteNextID(WritableFileWriter* file) const { - uint32_t next_id = next_id_.fetch_add(1); +Status MockTableFactory::GetAndWriteNextID(WritableFileWriter* file, + uint32_t* next_id) const { + *next_id = next_id_.fetch_add(1); char buf[4]; - EncodeFixed32(buf, next_id); - file->Append(Slice(buf, 4)); - return next_id; + EncodeFixed32(buf, *next_id); + return file->Append(Slice(buf, 4)); } -uint32_t MockTableFactory::GetIDFromFile(RandomAccessFileReader* file) const { +Status MockTableFactory::GetIDFromFile(RandomAccessFileReader* file, + uint32_t* id) const { char buf[4]; Slice result; - file->Read(IOOptions(), 0, 4, &result, buf, nullptr); + Status s = file->Read(IOOptions(), 0, 4, &result, buf, nullptr); assert(result.size() == 4); - return DecodeFixed32(buf); + *id = DecodeFixed32(buf); + return s; } void MockTableFactory::AssertSingleFile(const KVVector& file_contents) { @@ -303,8 +313,9 @@ void MockTableFactory::AssertLatestFile(const KVVector& file_contents) { ParsedInternalKey ikey; std::string key, value; std::tie(key, value) = kv; - ASSERT_OK(ParseInternalKey(Slice(key), &ikey)); - std::cout << ikey.DebugString(false) << " -> " << value << std::endl; + ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */)); + std::cout << ikey.DebugString(true, false) << " -> " << value + << std::endl; } FAIL(); } diff --git a/table/mock_table.h b/table/mock_table.h index 0ab9674d6e9..095f6334101 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -31,7 +31,8 @@ using KVPair = std::pair; using KVVector = std::vector; KVVector MakeMockFile(std::initializer_list l = {}); -void SortKVVector(KVVector* kv_vector); +void SortKVVector(KVVector* kv_vector, + const Comparator* ucmp = BytewiseComparator()); struct MockTableFileSystem { port::Mutex mutex; @@ -57,7 +58,7 @@ class MockTableFactory : public TableFactory { bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_familly_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; // This function will directly create mock table instead of going through // MockTableBuilder. 
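GetAndWriteNextID and GetIDFromFile now surface I/O failures instead of returning a bare id, so callers check the Status and receive the id through an out-parameter, as NewTableReader does above:

uint32_t id = 0;
Status s = GetIDFromFile(file.get(), &id);  // previously: id = GetIDFromFile(file.get())
if (!s.ok()) {
  return s;  // propagate the read failure instead of using an undefined id
}
// ... continue with the decoded id ...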
file_contents has to have a format of next_id_; diff --git a/table/multiget_context.h b/table/multiget_context.h index 604a26f8b4e..1c9f8da940b 100644 --- a/table/multiget_context.h +++ b/table/multiget_context.h @@ -7,6 +7,8 @@ #include #include #include + +#include "db/dbformat.h" #include "db/lookup_key.h" #include "db/merge_context.h" #include "rocksdb/env.h" @@ -21,13 +23,15 @@ class GetContext; struct KeyContext { const Slice* key; LookupKey* lkey; - Slice ukey; + Slice ukey_with_ts; + Slice ukey_without_ts; Slice ikey; ColumnFamilyHandle* column_family; Status* s; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq; bool key_exists; + bool is_blob_index; void* cb_arg; PinnableSlice* value; std::string* timestamp; @@ -41,6 +45,7 @@ struct KeyContext { s(stat), max_covering_tombstone_seq(0), key_exists(false), + is_blob_index(false), cb_arg(nullptr), value(val), timestamp(ts), @@ -110,7 +115,10 @@ class MultiGetContext { sorted_keys_[iter] = (*sorted_keys)[begin + iter]; sorted_keys_[iter]->lkey = new (&lookup_key_ptr_[iter]) LookupKey(*sorted_keys_[iter]->key, snapshot, read_opts.timestamp); - sorted_keys_[iter]->ukey = sorted_keys_[iter]->lkey->user_key(); + sorted_keys_[iter]->ukey_with_ts = sorted_keys_[iter]->lkey->user_key(); + sorted_keys_[iter]->ukey_without_ts = StripTimestampFromUserKey( + sorted_keys_[iter]->lkey->user_key(), + read_opts.timestamp == nullptr ? 0 : read_opts.timestamp->size()); sorted_keys_[iter]->ikey = sorted_keys_[iter]->lkey->internal_key(); } } diff --git a/table/plain/plain_table_builder.cc b/table/plain/plain_table_builder.cc index faebcfe2f03..3a1f0a41bcc 100644 --- a/table/plain/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -57,9 +57,8 @@ extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; PlainTableBuilder::PlainTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const std::vector>* - int_tbl_prop_collector_factories, + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const IntTblPropCollectorFactoryRange& int_tbl_prop_collector_factories, uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, const std::string& column_family_name, @@ -100,6 +99,10 @@ PlainTableBuilder::PlainTableBuilder( properties_.column_family_name = column_family_name; properties_.db_id = db_id; properties_.db_session_id = db_session_id; + properties_.db_host_id = ioptions.db_host_id; + if (!ReifyDbHostIdProperty(ioptions_.env, &properties_.db_host_id).ok()) { + ROCKS_LOG_INFO(ioptions_.logger, "db_host_id property will not be set"); + } properties_.prefix_extractor_name = moptions_.prefix_extractor != nullptr ? 
moptions_.prefix_extractor->Name() : "nullptr"; @@ -109,9 +112,12 @@ PlainTableBuilder::PlainTableBuilder( properties_.user_collected_properties [PlainTablePropertyNames::kEncodingType] = val; - for (auto& collector_factories : *int_tbl_prop_collector_factories) { + for (auto it = int_tbl_prop_collector_factories.first; + it != int_tbl_prop_collector_factories.second; ++it) { + assert(*it); + table_properties_collectors_.emplace_back( - collector_factories->CreateIntTblPropCollector(column_family_id)); + (*it)->CreateIntTblPropCollector(column_family_id)); } } @@ -128,7 +134,8 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { size_t meta_bytes_buf_size = 0; ParsedInternalKey internal_key; - if (ParseInternalKey(key, &internal_key) != Status::OK()) { + if (!ParseInternalKey(key, &internal_key, false /* log_err_key */) + .ok()) { // TODO assert(false); return; } @@ -188,7 +195,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // notify property collectors NotifyCollectTableCollectorsOnAdd( - key, value, offset_, table_properties_collectors_, ioptions_.info_log); + key, value, offset_, table_properties_collectors_, ioptions_.logger); status_ = io_status_; } @@ -209,13 +216,12 @@ Status PlainTableBuilder::Finish() { if (store_index_in_file_ && (properties_.num_entries > 0)) { assert(properties_.num_entries <= std::numeric_limits::max()); - Status s; BlockHandle bloom_block_handle; if (bloom_bits_per_key_ > 0) { bloom_block_.SetTotalBits( &arena_, static_cast(properties_.num_entries) * bloom_bits_per_key_, - ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.info_log); + ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.logger); PutVarint32(&properties_.user_collected_properties [PlainTablePropertyNames::kNumBloomBlocks], @@ -259,9 +265,8 @@ Status PlainTableBuilder::Finish() { property_block_builder.Add(properties_.user_collected_properties); // -- Add user collected properties - NotifyCollectTableCollectorsOnFinish(table_properties_collectors_, - ioptions_.info_log, - &property_block_builder); + NotifyCollectTableCollectorsOnFinish( + table_properties_collectors_, ioptions_.logger, &property_block_builder); // -- Write property block BlockHandle property_block_handle; diff --git a/table/plain/plain_table_builder.h b/table/plain/plain_table_builder.h index 6ab5d59e305..7305cb15349 100644 --- a/table/plain/plain_table_builder.h +++ b/table/plain/plain_table_builder.h @@ -37,9 +37,8 @@ class PlainTableBuilder: public TableBuilder { // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. 
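For illustration, the ParseInternalKey() contract this patch moves to (visible in the PlainTableBuilder::Add hunk just above and in several readers below): the function now returns a Status and takes a log_err_key flag controlling whether the offending key is echoed into the error text. A minimal caller sketch; the helper name CheckKey is hypothetical.

    // Illustrative sketch, not part of the patch.
    #include "db/dbformat.h"  // ParsedInternalKey, ParseInternalKey

    namespace {
    using namespace ROCKSDB_NAMESPACE;

    Status CheckKey(const Slice& internal_key) {
      ParsedInternalKey ikey;
      // false keeps the (possibly binary) key out of the message; callers that
      // want it logged for debugging pass true, as some hunks below do.
      Status pik_status =
          ParseInternalKey(internal_key, &ikey, false /* log_err_key */);
      if (!pik_status.ok()) {
        return pik_status;  // replaces the old `!= Status::OK()` comparison
      }
      // ikey.user_key / ikey.sequence / ikey.type are now safe to read.
      return Status::OK();
    }
    }  // namespace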
PlainTableBuilder( - const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, - const std::vector>* - int_tbl_prop_collector_factories, + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, + const IntTblPropCollectorFactoryRange& int_tbl_prop_collector_factories, uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_size, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, @@ -96,7 +95,7 @@ class PlainTableBuilder: public TableBuilder { private: Arena arena_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; const MutableCFOptions& moptions_; std::vector> table_properties_collectors_; diff --git a/table/plain/plain_table_factory.cc b/table/plain/plain_table_factory.cc index e0d0e69f68d..dd345492dfe 100644 --- a/table/plain/plain_table_factory.cc +++ b/table/plain/plain_table_factory.cc @@ -52,8 +52,7 @@ static std::unordered_map plain_table_type_info = { PlainTableFactory::PlainTableFactory(const PlainTableOptions& options) : table_options_(options) { - ConfigurableHelper::RegisterOptions(*this, &table_options_, - &plain_table_type_info); + RegisterOptions(&table_options_, &plain_table_type_info); } Status PlainTableFactory::NewTableReader( @@ -71,7 +70,7 @@ Status PlainTableFactory::NewTableReader( } TableBuilder* PlainTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, + const TableBuilderOptions& table_builder_options, WritableFileWriter* file) const { // Ignore the skip_filters flag. PlainTable format is optimized for small // in-memory dbs. The skip_filters optimization is not useful for plain @@ -79,9 +78,10 @@ TableBuilder* PlainTableFactory::NewTableBuilder( // return new PlainTableBuilder( table_builder_options.ioptions, table_builder_options.moptions, - table_builder_options.int_tbl_prop_collector_factories, column_family_id, - file, table_options_.user_key_len, table_options_.encoding_type, - table_options_.index_sparseness, table_options_.bloom_bits_per_key, + table_builder_options.int_tbl_prop_collector_factories, + table_builder_options.column_family_id, file, table_options_.user_key_len, + table_options_.encoding_type, table_options_.index_sparseness, + table_options_.bloom_bits_per_key, table_builder_options.column_family_name, 6, table_options_.huge_page_tlb_size, table_options_.hash_table_ratio, table_options_.store_index_in_file, table_builder_options.db_id, diff --git a/table/plain/plain_table_factory.h b/table/plain/plain_table_factory.h index 61a1ed935cb..e482403277d 100644 --- a/table/plain/plain_table_factory.h +++ b/table/plain/plain_table_factory.h @@ -156,6 +156,8 @@ class PlainTableFactory : public TableFactory { explicit PlainTableFactory( const PlainTableOptions& _table_options = PlainTableOptions()); + // Method to allow CheckedCast to work for this class + static const char* kClassName() { return kPlainTableName(); } const char* Name() const override { return kPlainTableName(); } using TableFactory::NewTableReader; Status NewTableReader(const ReadOptions& ro, @@ -166,7 +168,7 @@ class PlainTableFactory : public TableFactory { TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - uint32_t column_family_id, WritableFileWriter* file) const override; + WritableFileWriter* file) const override; std::string GetPrintableOptions() const override; static const char kValueTypeSeqId0 = char(~0); diff --git a/table/plain/plain_table_index.cc b/table/plain/plain_table_index.cc index 
1099dfa6e43..f9f700e6e73 100644 --- a/table/plain/plain_table_index.cc +++ b/table/plain/plain_table_index.cc @@ -98,7 +98,7 @@ Slice PlainTableIndexBuilder::Finish() { BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); keys_per_prefix_hist_.Add(num_keys_per_prefix_); - ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s", + ROCKS_LOG_INFO(ioptions_.logger, "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist_.ToString().c_str()); // From the temp data structure, populate indexes. @@ -153,12 +153,12 @@ void PlainTableIndexBuilder::BucketizeIndexes( Slice PlainTableIndexBuilder::FillIndexes( const std::vector& hash_to_offsets, const std::vector& entries_per_bucket) { - ROCKS_LOG_DEBUG(ioptions_.info_log, + ROCKS_LOG_DEBUG(ioptions_.logger, "Reserving %" PRIu32 " bytes for plain table's sub_index", sub_index_size_); auto total_allocate_size = GetTotalSize(); char* allocated = arena_->AllocateAligned( - total_allocate_size, huge_page_tlb_size_, ioptions_.info_log); + total_allocate_size, huge_page_tlb_size_, ioptions_.logger); auto temp_ptr = EncodeVarint32(allocated, index_size_); uint32_t* index = @@ -198,7 +198,7 @@ Slice PlainTableIndexBuilder::FillIndexes( } assert(sub_index_offset == sub_index_size_); - ROCKS_LOG_DEBUG(ioptions_.info_log, + ROCKS_LOG_DEBUG(ioptions_.logger, "hash table size: %" PRIu32 ", suffix_map length %" PRIu32, index_size_, sub_index_size_); return Slice(allocated, GetTotalSize()); diff --git a/table/plain/plain_table_index.h b/table/plain/plain_table_index.h index 1202a2f566f..3ef0705d484 100644 --- a/table/plain/plain_table_index.h +++ b/table/plain/plain_table_index.h @@ -131,7 +131,7 @@ class PlainTableIndex { // The class is used by PlainTableBuilder class. class PlainTableIndexBuilder { public: - PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, + PlainTableIndexBuilder(Arena* arena, const ImmutableOptions& ioptions, const SliceTransform* prefix_extractor, size_t index_sparseness, double hash_table_ratio, size_t huge_page_tlb_size) @@ -222,7 +222,7 @@ class PlainTableIndexBuilder { const std::vector& entries_per_bucket); Arena* arena_; - const ImmutableCFOptions ioptions_; + const ImmutableOptions ioptions_; HistogramImpl keys_per_prefix_hist_; IndexRecordList record_list_; bool is_first_record_; diff --git a/table/plain/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc index 39feb8dd050..e3a76f89ea2 100644 --- a/table/plain/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -85,8 +85,10 @@ IOStatus PlainTableKeyEncoder::AppendKey(const Slice& key, uint64_t* offset, char* meta_bytes_buf, size_t* meta_bytes_buf_size) { ParsedInternalKey parsed_key; - if (ParseInternalKey(key, &parsed_key) != Status::OK()) { - return IOStatus::Corruption(Slice()); + Status pik_status = + ParseInternalKey(key, &parsed_key, false /* log_err_key */); // TODO + if (!pik_status.ok()) { + return IOStatus::Corruption(pik_status.getState()); } Slice key_to_write = key; // Portion of internal key to write out. @@ -279,9 +281,12 @@ Status PlainTableKeyDecoder::ReadInternalKey( return file_reader_.status(); } *internal_key_valid = true; - if (ParseInternalKey(*internal_key, parsed_key) != Status::OK()) { + Status pik_status = ParseInternalKey(*internal_key, parsed_key, + false /* log_err_key */); // TODO + if (!pik_status.ok()) { return Status::Corruption( - Slice("Incorrect value type found when reading the next key")); + Slice("Corrupted key found during next key read. 
"), + pik_status.getState()); } *bytes_read += user_key_size + 8; } diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index e08174948c9..fbd62426862 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -93,7 +93,7 @@ class PlainTableIterator : public InternalIterator { extern const uint64_t kPlainTableMagicNumber; PlainTableReader::PlainTableReader( - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, std::unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& icomparator, EncodingType encoding_type, uint64_t file_size, @@ -118,7 +118,7 @@ PlainTableReader::~PlainTableReader() { } Status PlainTableReader::Open( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const ImmutableOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, const int bloom_bits_per_key, @@ -277,7 +277,7 @@ void PlainTableReader::AllocateBloom(int bloom_bits_per_key, int num_keys, if (bloom_total_bits > 0) { enable_bloom_ = true; bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, - huge_page_tlb_size, ioptions_.info_log); + huge_page_tlb_size, ioptions_.logger); } } @@ -457,7 +457,8 @@ Status PlainTableReader::GetOffset(PlainTableKeyDecoder* decoder, uint32_t high = upper_bound; ParsedInternalKey mid_key; ParsedInternalKey parsed_target; - Status s = ParseInternalKey(target, &parsed_target); + Status s = ParseInternalKey(target, &parsed_target, + false /* log_err_key */); // TODO if (!s.ok()) return s; // The key is between [low, high). Do a binary search between it. @@ -593,8 +594,9 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, } ParsedInternalKey found_key; ParsedInternalKey parsed_target; - s = ParseInternalKey(target, &parsed_target); - if (!s.ok()) return Status::Corruption(Slice()); + s = ParseInternalKey(target, &parsed_target, + false /* log_err_key */); // TODO + if (!s.ok()) return s; Slice found_value; while (offset < file_info_.data_end_offset) { diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index e3b12a9c32b..98bfa59df60 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -67,7 +67,7 @@ class PlainTableReader: public TableReader { // whether it points to the data offset of the first key with the key prefix // or the offset of it. If there are too many keys share this prefix, it will // create a binary search-able index from the suffix to offset on disk. 
- static Status Open(const ImmutableCFOptions& ioptions, + static Status Open(const ImmutableOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, @@ -110,7 +110,7 @@ class PlainTableReader: public TableReader { return arena_.MemoryAllocatedBytes(); } - PlainTableReader(const ImmutableCFOptions& ioptions, + PlainTableReader(const ImmutableOptions& ioptions, std::unique_ptr&& file, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, @@ -163,7 +163,7 @@ class PlainTableReader: public TableReader { CacheAllocationPtr index_block_alloc_; CacheAllocationPtr bloom_block_alloc_; - const ImmutableCFOptions& ioptions_; + const ImmutableOptions& ioptions_; std::unique_ptr dummy_cleanable_; uint64_t file_size_; protected: // for testing diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc index cff9dd970ef..d2ade5bfeaf 100644 --- a/table/sst_file_dumper.cc +++ b/table/sst_file_dumper.cc @@ -18,7 +18,6 @@ #include "db/blob/blob_index.h" #include "db/memtable.h" #include "db/write_batch_internal.h" -#include "env/composite_env_wrapper.h" #include "options/cf_options.h" #include "port/port.h" #include "rocksdb/db.h" @@ -80,11 +79,13 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { // read table magic number Footer footer; - std::unique_ptr file; + const auto& fs = options_.env->GetFileSystem(); + std::unique_ptr file; uint64_t file_size = 0; - Status s = options_.env->NewRandomAccessFile(file_path, &file, soptions_); + Status s = fs->NewRandomAccessFile(file_path, FileOptions(soptions_), &file, + nullptr); if (s.ok()) { - s = options_.env->GetFileSize(file_path, &file_size); + s = fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr); } // check empty file @@ -93,8 +94,7 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { return Status::Aborted(file_path, "Empty file"); } - file_.reset(new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file), - file_path)); + file_.reset(new RandomAccessFileReader(std::move(file), file_path)); FilePrefetchBuffer prefetch_buffer(nullptr, 0, 0, true /* enable */, false /* track_min_offset */); @@ -119,9 +119,10 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { if (magic_number == kPlainTableMagicNumber || magic_number == kLegacyPlainTableMagicNumber) { soptions_.use_mmap_reads = true; - options_.env->NewRandomAccessFile(file_path, &file, soptions_); - file_.reset(new RandomAccessFileReader( - NewLegacyRandomAccessFileWrapper(file), file_path)); + + fs->NewRandomAccessFile(file_path, FileOptions(soptions_), &file, + nullptr); + file_.reset(new RandomAccessFileReader(std::move(file), file_path)); } options_.comparator = &internal_comparator_; // For old sst format, ReadTableProperties might fail but file can be read @@ -144,7 +145,7 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { } Status SstFileDumper::NewTableReader( - const ImmutableCFOptions& /*ioptions*/, const EnvOptions& /*soptions*/, + const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/, const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size, std::unique_ptr* /*table_reader*/) { auto t_opt = @@ -177,8 +178,10 @@ Status SstFileDumper::VerifyChecksum() { Status SstFileDumper::DumpTable(const std::string& out_filename) { std::unique_ptr out_file; Env* env = options_.env; - env->NewWritableFile(out_filename, &out_file, soptions_); - Status s = 
table_reader_->DumpTable(out_file.get()); + Status s = env->NewWritableFile(out_filename, &out_file, soptions_); + if (s.ok()) { + s = table_reader_->DumpTable(out_file.get()); + } if (!s.ok()) { // close the file before return error, ignore the close error if there's any out_file->Close().PermitUncheckedError(); @@ -190,23 +193,20 @@ Status SstFileDumper::DumpTable(const std::string& out_filename) { Status SstFileDumper::CalculateCompressedTableSize( const TableBuilderOptions& tb_options, size_t block_size, uint64_t* num_data_blocks, uint64_t* compressed_table_size) { - std::unique_ptr out_file; std::unique_ptr env(NewMemEnv(options_.env)); - Status s = env->NewWritableFile(testFileName, &out_file, soptions_); + std::unique_ptr dest_writer; + Status s = + WritableFileWriter::Create(env->GetFileSystem(), testFileName, + FileOptions(soptions_), &dest_writer, nullptr); if (!s.ok()) { return s; } - std::unique_ptr dest_writer; - dest_writer.reset( - new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(out_file)), - testFileName, soptions_)); BlockBasedTableOptions table_options; table_options.block_size = block_size; BlockBasedTableFactory block_based_tf(table_options); std::unique_ptr table_builder; table_builder.reset(block_based_tf.NewTableBuilder( tb_options, - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, dest_writer.get())); std::unique_ptr iter(table_reader_->NewIterator( read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr, @@ -233,7 +233,8 @@ Status SstFileDumper::ShowAllCompressionSizes( const std::vector>& compression_types, int32_t compress_level_from, int32_t compress_level_to, - uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes) { + uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes, + uint64_t max_dict_buffer_bytes) { fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size); for (auto& i : compression_types) { if (CompressionTypeSupported(i.first)) { @@ -241,6 +242,7 @@ Status SstFileDumper::ShowAllCompressionSizes( CompressionOptions compress_opt; compress_opt.max_dict_bytes = max_dict_bytes; compress_opt.zstd_max_train_bytes = zstd_max_train_bytes; + compress_opt.max_dict_buffer_bytes = max_dict_buffer_bytes; for (int32_t j = compress_level_from; j <= compress_level_to; j++) { fprintf(stdout, "Compression level: %d", j); compress_opt.level = j; @@ -262,18 +264,18 @@ Status SstFileDumper::ShowCompressionSize( Options opts; opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); opts.statistics->set_stats_level(StatsLevel::kAll); - const ImmutableCFOptions imoptions(opts); + const ImmutableOptions imoptions(opts); const ColumnFamilyOptions cfo(opts); const MutableCFOptions moptions(cfo); ROCKSDB_NAMESPACE::InternalKeyComparator ikc(opts.comparator); - std::vector> - block_based_table_factories; + IntTblPropCollectorFactories block_based_table_factories; std::string column_family_name; int unknown_level = -1; TableBuilderOptions tb_opts( imoptions, moptions, ikc, &block_based_table_factories, compress_type, - 0 /* sample_for_compression */, compress_opt, false /* skip_filters */, + compress_opt, + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, column_family_name, unknown_level); uint64_t num_data_blocks = 0; std::chrono::steady_clock::time_point start = @@ -441,9 +443,9 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, if (read_num > 0 && i > read_num) break; ParsedInternalKey ikey; - if (ParseInternalKey(key, &ikey) != Status::OK()) { - std::cerr << "Internal Key [" << 
key.ToString(true /* in hex*/) - << "] parse error!\n"; + Status pik_status = ParseInternalKey(key, &ikey, true /* log_err_key */); + if (!pik_status.ok()) { + std::cerr << pik_status.getState() << "\n"; continue; } @@ -459,7 +461,8 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, if (print_kv) { if (!decode_blob_index_ || ikey.type != kTypeBlobIndex) { - fprintf(stdout, "%s => %s\n", ikey.DebugString(output_hex_).c_str(), + fprintf(stdout, "%s => %s\n", + ikey.DebugString(true, output_hex_).c_str(), value.ToString(output_hex_).c_str()); } else { BlobIndex blob_index; @@ -467,11 +470,12 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, const Status s = blob_index.DecodeFrom(value); if (!s.ok()) { fprintf(stderr, "%s => error decoding blob index\n", - ikey.DebugString(output_hex_).c_str()); + ikey.DebugString(true, output_hex_).c_str()); continue; } - fprintf(stdout, "%s => %s\n", ikey.DebugString(output_hex_).c_str(), + fprintf(stdout, "%s => %s\n", + ikey.DebugString(true, output_hex_).c_str(), blob_index.DebugString(output_hex_).c_str()); } } diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h index 9153f8a3800..32aa7be9474 100644 --- a/table/sst_file_dumper.h +++ b/table/sst_file_dumper.h @@ -40,7 +40,8 @@ class SstFileDumper { const std::vector>& compression_types, int32_t compress_level_from, int32_t compress_level_to, - uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes); + uint32_t max_dict_bytes, uint32_t zstd_max_train_bytes, + uint64_t max_dict_buffer_bytes); Status ShowCompressionSize(size_t block_size, CompressionType compress_type, const CompressionOptions& compress_opt); @@ -62,7 +63,7 @@ class SstFileDumper { // Helper function to call the factory with settings specific to the // factory implementation - Status NewTableReader(const ImmutableCFOptions& ioptions, + Status NewTableReader(const ImmutableOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, uint64_t file_size, @@ -84,7 +85,7 @@ class SstFileDumper { std::unique_ptr table_reader_; std::unique_ptr file_; - const ImmutableCFOptions ioptions_; + const ImmutableOptions ioptions_; const MutableCFOptions moptions_; ReadOptions read_options_; InternalKeyComparator internal_comparator_; diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc index f7f22b06110..e106bca9d65 100644 --- a/table/sst_file_reader.cc +++ b/table/sst_file_reader.cc @@ -10,9 +10,10 @@ #include "db/arena_wrapped_db_iter.h" #include "db/db_iter.h" #include "db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/random_access_file_reader.h" #include "options/cf_options.h" +#include "rocksdb/env.h" +#include "rocksdb/file_system.h" #include "table/get_context.h" #include "table/table_builder.h" #include "table/table_reader.h" @@ -22,7 +23,7 @@ namespace ROCKSDB_NAMESPACE { struct SstFileReader::Rep { Options options; EnvOptions soptions; - ImmutableCFOptions ioptions; + ImmutableOptions ioptions; MutableCFOptions moptions; std::unique_ptr table_reader; @@ -42,15 +43,17 @@ Status SstFileReader::Open(const std::string& file_path) { auto r = rep_.get(); Status s; uint64_t file_size = 0; - std::unique_ptr file; + std::unique_ptr file; std::unique_ptr file_reader; - s = r->options.env->GetFileSize(file_path, &file_size); + FileOptions fopts(r->soptions); + const auto& fs = r->options.env->GetFileSystem(); + + s = fs->GetFileSize(file_path, fopts.io_options, &file_size, nullptr); if (s.ok()) { - s = 
r->options.env->NewRandomAccessFile(file_path, &file, r->soptions); + s = fs->NewRandomAccessFile(file_path, fopts, &file, nullptr); } if (s.ok()) { - file_reader.reset(new RandomAccessFileReader( - NewLegacyRandomAccessFileWrapper(file), file_path)); + file_reader.reset(new RandomAccessFileReader(std::move(file), file_path)); } if (s.ok()) { TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor.get(), @@ -69,11 +72,12 @@ Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) { ? roptions.snapshot->GetSequenceNumber() : kMaxSequenceNumber; ArenaWrappedDBIter* res = new ArenaWrappedDBIter(); - res->Init(r->options.env, roptions, r->ioptions, r->moptions, sequence, + res->Init(r->options.env, roptions, r->ioptions, r->moptions, + nullptr /* version */, sequence, r->moptions.max_sequential_skip_in_iterations, 0 /* version_number */, nullptr /* read_callback */, - nullptr /* db_impl */, nullptr /* cfd */, false /* allow_blob */, - false /* allow_refresh */); + nullptr /* db_impl */, nullptr /* cfd */, + true /* expose_blob_index */, false /* allow_refresh */); auto internal_iter = r->table_reader->NewIterator( res->GetReadOptions(), r->moptions.prefix_extractor.get(), res->GetArena(), false /* skip_filters */, diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc index 8a63b69bdd4..52cab2ab36e 100644 --- a/table/sst_file_reader_test.cc +++ b/table/sst_file_reader_test.cc @@ -5,11 +5,13 @@ #ifndef ROCKSDB_LITE +#include "rocksdb/sst_file_reader.h" + #include #include "port/stack_trace.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" -#include "rocksdb/sst_file_reader.h" #include "rocksdb/sst_file_writer.h" #include "table/sst_file_writer_collectors.h" #include "test_util/testharness.h" @@ -37,14 +39,8 @@ class SstFileReaderTest : public testing::Test { sst_name_ = test::PerThreadDBPath("sst_file"); Env* base_env = Env::Default(); - const char* test_env_uri = getenv("TEST_ENV_URI"); - if(test_env_uri) { - Env* test_env = nullptr; - Status s = Env::LoadEnv(test_env_uri, &test_env, &env_guard_); - base_env = test_env; - EXPECT_OK(s); - EXPECT_NE(Env::Default(), base_env); - } + EXPECT_OK( + test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); EXPECT_NE(nullptr, base_env); env_ = base_env; options_.env = env_; @@ -90,6 +86,9 @@ class SstFileReaderTest : public testing::Test { if (check_global_seqno) { auto properties = reader.GetTableProperties(); ASSERT_TRUE(properties); + std::string hostname; + ASSERT_OK(env_->GetHostNameString(&hostname)); + ASSERT_EQ(properties->db_host_id, hostname); auto& user_properties = properties->user_collected_properties; ASSERT_TRUE( user_properties.count(ExternalSstFilePropertyNames::kGlobalSeqno)); diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index f6583beac80..9943296e991 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -8,8 +8,8 @@ #include #include "db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/writable_file_writer.h" +#include "rocksdb/file_system.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_builder.h" #include "table/sst_file_writer_collectors.h" @@ -46,7 +46,7 @@ struct SstFileWriter::Rep { std::unique_ptr file_writer; std::unique_ptr builder; EnvOptions env_options; - ImmutableCFOptions ioptions; + ImmutableOptions ioptions; MutableCFOptions mutable_cf_options; Env::IOPriority io_priority; InternalKeyComparator internal_comparator; @@ -104,7 +104,8 @@ struct SstFileWriter::Rep { 
file_info.largest_key.assign(user_key.data(), user_key.size()); file_info.file_size = builder->FileSize(); - return InvalidatePageCache(false /* closing */); + InvalidatePageCache(false /* closing */).PermitUncheckedError(); + return Status::OK(); } Status DeleteRange(const Slice& begin_key, const Slice& end_key) { @@ -138,7 +139,8 @@ struct SstFileWriter::Rep { file_info.num_range_del_entries++; file_info.file_size = builder->FileSize(); - return InvalidatePageCache(false /* closing */); + InvalidatePageCache(false /* closing */).PermitUncheckedError(); + return Status::OK(); } Status InvalidatePageCache(bool closing) { @@ -189,8 +191,10 @@ SstFileWriter::~SstFileWriter() { Status SstFileWriter::Open(const std::string& file_path) { Rep* r = rep_.get(); Status s; - std::unique_ptr sst_file; - s = r->ioptions.env->NewWritableFile(file_path, &sst_file, r->env_options); + std::unique_ptr sst_file; + FileOptions cur_file_opts(r->env_options); + s = r->ioptions.env->GetFileSystem()->NewWritableFile( + file_path, cur_file_opts, &sst_file, nullptr); if (!s.ok()) { return s; } @@ -215,11 +219,8 @@ Status SstFileWriter::Open(const std::string& file_path) { compression_type = r->mutable_cf_options.compression; compression_opts = r->mutable_cf_options.compression_opts; } - uint64_t sample_for_compression = - r->mutable_cf_options.sample_for_compression; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; // SstFileWriter properties collector to add SstFileWriter version. int_tbl_prop_collector_factories.emplace_back( @@ -257,21 +258,25 @@ Status SstFileWriter::Open(const std::string& file_path) { } TableBuilderOptions table_builder_options( r->ioptions, r->mutable_cf_options, r->internal_comparator, - &int_tbl_prop_collector_factories, compression_type, - sample_for_compression, compression_opts, r->skip_filters, - r->column_family_name, unknown_level, 0 /* creation_time */, - 0 /* oldest_key_time */, 0 /* target_file_size */, - 0 /* file_creation_time */, "SST Writer" /* db_id */, db_session_id); + &int_tbl_prop_collector_factories, compression_type, compression_opts, + cf_id, r->column_family_name, unknown_level, false /* is_bottommost */, + TableFileCreationReason::kMisc, 0 /* creation_time */, + 0 /* oldest_key_time */, 0 /* file_creation_time */, + "SST Writer" /* db_id */, db_session_id, 0 /* target_file_size */, 0); + // XXX: when we can remove skip_filters from the SstFileWriter public API + // we can remove it from TableBuilderOptions. + table_builder_options.skip_filters = r->skip_filters; + FileTypeSet tmp_set = r->ioptions.checksum_handoff_file_types; r->file_writer.reset(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(sst_file)), file_path, - r->env_options, r->ioptions.env, nullptr /* io_tracer */, - nullptr /* stats */, r->ioptions.listeners, - r->ioptions.file_checksum_gen_factory)); + std::move(sst_file), file_path, r->env_options, r->ioptions.clock, + nullptr /* io_tracer */, nullptr /* stats */, r->ioptions.listeners, + r->ioptions.file_checksum_gen_factory.get(), + tmp_set.Contains(FileType::kTableFile))); // TODO(tec) : If table_factory is using compressed block cache, we will // be adding the external sst file blocks into it, which is wasteful. 
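For illustration, the FileSystem-based open path that SstFileWriter::Open above (and SstFileReader/SstFileDumper earlier in this patch) now uses: the file handle comes from Env::GetFileSystem() with FileOptions, and WritableFileWriter takes the FSWritableFile directly, with no NewLegacyWritableFileWrapper. The helper name OpenSstForWrite is hypothetical; the extra writer-constructor arguments (clock, listeners, checksum factory) are left at their defaults here.

    // Illustrative sketch, not part of the patch.
    #include <memory>
    #include <string>

    #include "file/writable_file_writer.h"
    #include "rocksdb/env.h"
    #include "rocksdb/file_system.h"

    namespace {
    using namespace ROCKSDB_NAMESPACE;

    Status OpenSstForWrite(Env* env, const std::string& file_path,
                           const EnvOptions& env_options,
                           std::unique_ptr<WritableFileWriter>* writer) {
      const auto& fs = env->GetFileSystem();
      FileOptions file_opts(env_options);  // FileOptions wraps the old EnvOptions
      std::unique_ptr<FSWritableFile> sst_file;
      Status s = fs->NewWritableFile(file_path, file_opts, &sst_file, nullptr);
      if (!s.ok()) {
        return s;
      }
      writer->reset(
          new WritableFileWriter(std::move(sst_file), file_path, file_opts));
      return Status::OK();
    }
    }  // namespace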
r->builder.reset(r->ioptions.table_factory->NewTableBuilder( - table_builder_options, cf_id, r->file_writer.get())); + table_builder_options, r->file_writer.get())); r->file_info = ExternalSstFileInfo(); r->file_info.file_path = file_path; @@ -318,9 +323,7 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { s = r->file_writer->Sync(r->ioptions.use_fsync); } if (s.ok()) { - s = r->InvalidatePageCache(true /* closing */); - } - if (s.ok()) { + r->InvalidatePageCache(true /* closing */).PermitUncheckedError(); s = r->file_writer->Close(); } } diff --git a/table/sst_file_writer_collectors.h b/table/sst_file_writer_collectors.h index 01ecec97176..2dbd611ab6f 100644 --- a/table/sst_file_writer_collectors.h +++ b/table/sst_file_writer_collectors.h @@ -35,9 +35,9 @@ class SstFileWriterPropertiesCollector : public IntTblPropCollector { return Status::OK(); } - virtual void BlockAdd(uint64_t /* blockRawBytes */, - uint64_t /* blockCompressedBytesFast */, - uint64_t /* blockCompressedBytesSlow */) override { + virtual void BlockAdd(uint64_t /* block_raw_bytes */, + uint64_t /* block_compressed_bytes_fast */, + uint64_t /* block_compressed_bytes_slow */) override { // Intentionally left blank. No interest in collecting stats for // blocks. return; diff --git a/table/table_builder.h b/table/table_builder.h index 36475c14376..f22b10750af 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -10,9 +10,11 @@ #pragma once #include + #include #include #include + #include "db/dbformat.h" #include "db/table_properties_collector.h" #include "file/writable_file_writer.h" @@ -28,22 +30,25 @@ class Status; struct TableReaderOptions { // @param skip_filters Disables loading/accessing the filter block - TableReaderOptions(const ImmutableCFOptions& _ioptions, + TableReaderOptions(const ImmutableOptions& _ioptions, const SliceTransform* _prefix_extractor, const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, bool _skip_filters = false, bool _immortal = false, bool _force_direct_prefetch = false, int _level = -1, BlockCacheTracer* const _block_cache_tracer = nullptr, - size_t _max_file_size_for_l0_meta_pin = 0) - : TableReaderOptions(_ioptions, _prefix_extractor, _env_options, - _internal_comparator, _skip_filters, _immortal, - _force_direct_prefetch, _level, - 0 /* _largest_seqno */, _block_cache_tracer, - _max_file_size_for_l0_meta_pin) {} + size_t _max_file_size_for_l0_meta_pin = 0, + const std::string& _cur_db_session_id = "", + uint64_t _cur_file_num = 0) + : TableReaderOptions( + _ioptions, _prefix_extractor, _env_options, _internal_comparator, + _skip_filters, _immortal, _force_direct_prefetch, _level, + 0 /* _largest_seqno */, _block_cache_tracer, + _max_file_size_for_l0_meta_pin, _cur_db_session_id, _cur_file_num) { + } // @param skip_filters Disables loading/accessing the filter block - TableReaderOptions(const ImmutableCFOptions& _ioptions, + TableReaderOptions(const ImmutableOptions& _ioptions, const SliceTransform* _prefix_extractor, const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, @@ -51,7 +56,9 @@ struct TableReaderOptions { bool _force_direct_prefetch, int _level, SequenceNumber _largest_seqno, BlockCacheTracer* const _block_cache_tracer, - size_t _max_file_size_for_l0_meta_pin) + size_t _max_file_size_for_l0_meta_pin, + const std::string& _cur_db_session_id, + uint64_t _cur_file_num) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), env_options(_env_options), @@ -62,9 +69,11 @@ struct TableReaderOptions 
{ level(_level), largest_seqno(_largest_seqno), block_cache_tracer(_block_cache_tracer), - max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin) {} + max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin), + cur_db_session_id(_cur_db_session_id), + cur_file_num(_cur_file_num) {} - const ImmutableCFOptions& ioptions; + const ImmutableOptions& ioptions; const SliceTransform* prefix_extractor; const EnvOptions& env_options; const InternalKeyComparator& internal_comparator; @@ -76,7 +85,8 @@ struct TableReaderOptions { // fetch into RocksDB's buffer, rather than relying // RandomAccessFile::Prefetch(). bool force_direct_prefetch; - // what level this table/file is on, -1 for "not set, don't know" + // What level this table/file is on, -1 for "not set, don't know." Used + // for level-specific statistics. int level; // largest seqno in the table SequenceNumber largest_seqno; @@ -84,55 +94,93 @@ struct TableReaderOptions { // Largest L0 file size whose meta-blocks may be pinned (can be zero when // unknown). const size_t max_file_size_for_l0_meta_pin; + + std::string cur_db_session_id; + + uint64_t cur_file_num; }; struct TableBuilderOptions { TableBuilderOptions( - const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, + const ImmutableOptions& _ioptions, const MutableCFOptions& _moptions, const InternalKeyComparator& _internal_comparator, - const std::vector>* - _int_tbl_prop_collector_factories, - CompressionType _compression_type, uint64_t _sample_for_compression, - const CompressionOptions& _compression_opts, bool _skip_filters, + const IntTblPropCollectorFactoryRange& _int_tbl_prop_collector_factories, + CompressionType _compression_type, + const CompressionOptions& _compression_opts, uint32_t _column_family_id, const std::string& _column_family_name, int _level, + bool _is_bottommost = false, + TableFileCreationReason _reason = TableFileCreationReason::kMisc, const uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0, - const uint64_t _target_file_size = 0, const uint64_t _file_creation_time = 0, const std::string& _db_id = "", - const std::string& _db_session_id = "") + const std::string& _db_session_id = "", + const uint64_t _target_file_size = 0, const uint64_t _cur_file_num = 0) : ioptions(_ioptions), moptions(_moptions), internal_comparator(_internal_comparator), int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories), compression_type(_compression_type), - sample_for_compression(_sample_for_compression), compression_opts(_compression_opts), - skip_filters(_skip_filters), + column_family_id(_column_family_id), column_family_name(_column_family_name), - level(_level), creation_time(_creation_time), oldest_key_time(_oldest_key_time), target_file_size(_target_file_size), file_creation_time(_file_creation_time), db_id(_db_id), - db_session_id(_db_session_id) {} + db_session_id(_db_session_id), + level_at_creation(_level), + is_bottommost(_is_bottommost), + reason(_reason), + cur_file_num(_cur_file_num) {} - const ImmutableCFOptions& ioptions; + TableBuilderOptions( + const ImmutableOptions& _ioptions, const MutableCFOptions& _moptions, + const InternalKeyComparator& _internal_comparator, + const IntTblPropCollectorFactories* _int_tbl_prop_collector_factories, + CompressionType _compression_type, + const CompressionOptions& _compression_opts, uint32_t _column_family_id, + const std::string& _column_family_name, int _level, + bool _is_bottommost = false, + TableFileCreationReason _reason = TableFileCreationReason::kMisc, + const 
uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0, + const uint64_t _file_creation_time = 0, const std::string& _db_id = "", + const std::string& _db_session_id = "", + const uint64_t _target_file_size = 0, const uint64_t _cur_file_num = 0) + : TableBuilderOptions(_ioptions, _moptions, _internal_comparator, + IntTblPropCollectorFactoryRange( + _int_tbl_prop_collector_factories->begin(), + _int_tbl_prop_collector_factories->end()), + _compression_type, _compression_opts, + _column_family_id, _column_family_name, _level, + _is_bottommost, _reason, _creation_time, + _oldest_key_time, _file_creation_time, _db_id, + _db_session_id, _target_file_size, _cur_file_num) {} + + const ImmutableOptions& ioptions; const MutableCFOptions& moptions; const InternalKeyComparator& internal_comparator; - const std::vector>* - int_tbl_prop_collector_factories; - CompressionType compression_type; - uint64_t sample_for_compression; + const IntTblPropCollectorFactoryRange int_tbl_prop_collector_factories; + const CompressionType compression_type; const CompressionOptions& compression_opts; - bool skip_filters; // only used by BlockBasedTableBuilder + const uint32_t column_family_id; const std::string& column_family_name; - int level; // what level this table/file is on, -1 for "not set, don't know" const uint64_t creation_time; const int64_t oldest_key_time; const uint64_t target_file_size; const uint64_t file_creation_time; const std::string db_id; const std::string db_session_id; + // BEGIN for FilterBuildingContext + const int level_at_creation; + const bool is_bottommost; + const TableFileCreationReason reason; + // END for FilterBuildingContext + + // XXX: only used by BlockBasedTableBuilder for SstFileWriter. If you + // want to skip filters, that should be (for example) null filter_policy + // in the table options of the ioptions.table_factory + bool skip_filters = false; + const uint64_t cur_file_num; }; // TableBuilder provides the interface used to build a Table diff --git a/table/table_factory.cc b/table/table_factory.cc index 18935c8591d..962bad9badd 100644 --- a/table/table_factory.cc +++ b/table/table_factory.cc @@ -3,6 +3,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
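For illustration, the calling convention implied by the reworked TableBuilderOptions above: the column family id and name travel inside the options, sample_for_compression and skip_filters are gone from the constructor (skip_filters is now a plain member), and NewTableBuilder() takes only the options plus the file writer. This condensed sketch mirrors the sst_file_dumper hunk earlier in the patch; MakeBuilder and the literal values are hypothetical.

    // Illustrative sketch, not part of the patch.
    #include <memory>
    #include <string>

    #include "options/cf_options.h"   // ImmutableOptions, MutableCFOptions
    #include "rocksdb/table.h"        // TableFactory
    #include "table/table_builder.h"  // TableBuilderOptions, TableBuilder

    namespace {
    using namespace ROCKSDB_NAMESPACE;

    std::unique_ptr<TableBuilder> MakeBuilder(
        const ImmutableOptions& ioptions, const MutableCFOptions& moptions,
        const InternalKeyComparator& ikc,
        const IntTblPropCollectorFactories* collector_factories,
        const TableFactory& factory, WritableFileWriter* file) {
      const std::string cf_name = "default";  // options keep a reference to it
      TableBuilderOptions tb_opts(
          ioptions, moptions, ikc, collector_factories,
          CompressionType::kNoCompression, CompressionOptions(),
          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
          cf_name, -1 /* unknown level */);
      tb_opts.skip_filters = false;  // now a member, not a constructor argument
      // No separate column_family_id argument any more.
      return std::unique_ptr<TableBuilder>(
          factory.NewTableBuilder(tb_opts, file));
    }
    }  // namespace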
+#include "options/customizable_helper.h" #include "rocksdb/convenience.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_factory.h" @@ -11,23 +12,9 @@ namespace ROCKSDB_NAMESPACE { -Status TableFactory::CreateFromString(const ConfigOptions& config_options_in, - const std::string& id, - std::shared_ptr* factory) { - Status status; - std::string name = id; - - std::string existing_opts; - - ConfigOptions config_options = config_options_in; - if (factory->get() != nullptr && name == factory->get()->Name()) { - config_options.delimiter = ";"; - - status = factory->get()->GetOptionString(config_options, &existing_opts); - if (!status.ok()) { - return status; - } - } +static bool LoadFactory(const std::string& name, + std::shared_ptr* factory) { + bool success = true; if (name == TableFactory::kBlockBasedTableName()) { factory->reset(new BlockBasedTableFactory()); #ifndef ROCKSDB_LITE @@ -37,14 +24,15 @@ Status TableFactory::CreateFromString(const ConfigOptions& config_options_in, factory->reset(new CuckooTableFactory()); #endif // ROCKSDB_LITE } else { - status = Status::NotSupported("Could not load table factory: ", name); - return status; - } - if (!existing_opts.empty()) { - config_options.invoke_prepare_options = false; - status = factory->get()->ConfigureFromString(config_options, existing_opts); + success = false; } - return status; + return success; } +Status TableFactory::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* factory) { + return LoadSharedObject(config_options, value, LoadFactory, + factory); +} } // namespace ROCKSDB_NAMESPACE diff --git a/table/table_properties.cc b/table/table_properties.cc index 622f3d45b17..373c763d6a6 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -111,6 +111,8 @@ std::string TableProperties::ToString( } AppendProperty(result, "filter block size", filter_size, prop_delim, kv_delim); + AppendProperty(result, "# entries for filter", num_filter_entries, prop_delim, + kv_delim); AppendProperty(result, "(estimated) table size", data_size + index_size + filter_size, prop_delim, kv_delim); @@ -168,6 +170,11 @@ std::string TableProperties::ToString( AppendProperty(result, "file creation time", file_creation_time, prop_delim, kv_delim); + AppendProperty(result, "slow compression estimated data size", + slow_compression_estimated_data_size, prop_delim, kv_delim); + AppendProperty(result, "fast compression estimated data size", + fast_compression_estimated_data_size, prop_delim, kv_delim); + // DB identity and DB session ID AppendProperty(result, "DB identity", db_id, prop_delim, kv_delim); AppendProperty(result, "DB session identity", db_session_id, prop_delim, @@ -188,14 +195,44 @@ void TableProperties::Add(const TableProperties& tp) { raw_value_size += tp.raw_value_size; num_data_blocks += tp.num_data_blocks; num_entries += tp.num_entries; + num_filter_entries += tp.num_filter_entries; num_deletions += tp.num_deletions; num_merge_operands += tp.num_merge_operands; num_range_deletions += tp.num_range_deletions; + slow_compression_estimated_data_size += + tp.slow_compression_estimated_data_size; + fast_compression_estimated_data_size += + tp.fast_compression_estimated_data_size; +} + +std::map +TableProperties::GetAggregatablePropertiesAsMap() const { + std::map rv; + rv["data_size"] = data_size; + rv["index_size"] = index_size; + rv["index_partitions"] = index_partitions; + rv["top_level_index_size"] = top_level_index_size; + rv["filter_size"] = filter_size; 
+ rv["raw_key_size"] = raw_key_size; + rv["raw_value_size"] = raw_value_size; + rv["num_data_blocks"] = num_data_blocks; + rv["num_entries"] = num_entries; + rv["num_filter_entries"] = num_filter_entries; + rv["num_deletions"] = num_deletions; + rv["num_merge_operands"] = num_merge_operands; + rv["num_range_deletions"] = num_range_deletions; + rv["slow_compression_estimated_data_size"] = + slow_compression_estimated_data_size; + rv["fast_compression_estimated_data_size"] = + fast_compression_estimated_data_size; + return rv; } const std::string TablePropertiesNames::kDbId = "rocksdb.creating.db.identity"; const std::string TablePropertiesNames::kDbSessionId = "rocksdb.creating.session.identity"; +const std::string TablePropertiesNames::kDbHostId = + "rocksdb.creating.host.identity"; const std::string TablePropertiesNames::kDataSize = "rocksdb.data.size"; const std::string TablePropertiesNames::kIndexSize = @@ -218,6 +255,8 @@ const std::string TablePropertiesNames::kNumDataBlocks = "rocksdb.num.data.blocks"; const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries"; +const std::string TablePropertiesNames::kNumFilterEntries = + "rocksdb.num.filter_entries"; const std::string TablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; const std::string TablePropertiesNames::kMergeOperands = "rocksdb.merge.operands"; @@ -248,6 +287,10 @@ const std::string TablePropertiesNames::kOldestKeyTime = "rocksdb.oldest.key.time"; const std::string TablePropertiesNames::kFileCreationTime = "rocksdb.file.creation.time"; +const std::string TablePropertiesNames::kSlowCompressionEstimatedDataSize = + "rocksdb.sample_for_compression.slow.data.size"; +const std::string TablePropertiesNames::kFastCompressionEstimatedDataSize = + "rocksdb.sample_for_compression.fast.data.size"; extern const std::string kPropertiesBlock = "rocksdb.properties"; // Old property block name for backward compatibility diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index f1fd605aa49..df4a750d793 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -13,11 +13,12 @@ int main() { #include "db/db_impl/db_impl.h" #include "db/dbformat.h" -#include "env/composite_env_wrapper.h" #include "file/random_access_file_reader.h" #include "monitoring/histogram.h" #include "rocksdb/db.h" +#include "rocksdb/file_system.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/system_clock.h" #include "rocksdb/table.h" #include "table/block_based/block_based_table_factory.h" #include "table/get_context.h" @@ -50,8 +51,8 @@ static std::string MakeKey(int i, int j, bool through_db) { return key.Encode().ToString(); } -uint64_t Now(Env* env, bool measured_by_nanosecond) { - return measured_by_nanosecond ? env->NowNanos() : env->NowMicros(); +uint64_t Now(SystemClock* clock, bool measured_by_nanosecond) { + return measured_by_nanosecond ? 
clock->NowNanos() : clock->NowMicros(); } } // namespace @@ -81,30 +82,28 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::string dbname = test::PerThreadDBPath("rocksdb_table_reader_bench_db"); WriteOptions wo; Env* env = Env::Default(); + auto* clock = env->GetSystemClock().get(); TableBuilder* tb = nullptr; DB* db = nullptr; Status s; - const ImmutableCFOptions ioptions(opts); + const ImmutableOptions ioptions(opts); const ColumnFamilyOptions cfo(opts); const MutableCFOptions moptions(cfo); std::unique_ptr file_writer; if (!through_db) { - std::unique_ptr file; - env->NewWritableFile(file_name, &file, env_options); + ASSERT_OK(WritableFileWriter::Create(env->GetFileSystem(), file_name, + FileOptions(env_options), &file_writer, + nullptr)); - std::vector > - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; - file_writer.reset(new WritableFileWriter( - NewLegacyWritableFileWrapper(std::move(file)), file_name, env_options)); int unknown_level = -1; tb = opts.table_factory->NewTableBuilder( TableBuilderOptions( ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - CompressionType::kNoCompression, 0 /* sample_for_compression */, - CompressionOptions(), false /* skip_filters */, - kDefaultColumnFamilyName, unknown_level), - 0 /* column_family_id */, file_writer.get()); + CompressionType::kNoCompression, CompressionOptions(), + 0 /* column_family_id */, kDefaultColumnFamilyName, unknown_level), + file_writer.get()); } else { s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -130,17 +129,19 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::unique_ptr table_reader; if (!through_db) { - std::unique_ptr raf; - s = env->NewRandomAccessFile(file_name, &raf, env_options); + const auto& fs = env->GetFileSystem(); + FileOptions fopts(env_options); + + std::unique_ptr raf; + s = fs->NewRandomAccessFile(file_name, fopts, &raf, nullptr); if (!s.ok()) { fprintf(stderr, "Create File Error: %s\n", s.ToString().c_str()); exit(1); } uint64_t file_size; - env->GetFileSize(file_name, &file_size); + fs->GetFileSize(file_name, fopts.io_options, &file_size, nullptr); std::unique_ptr file_reader( - new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(raf), - file_name)); + new RandomAccessFileReader(std::move(raf), file_name)); s = opts.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), env_options, ikc), @@ -168,21 +169,21 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, if (!for_iterator) { // Query one existing key; std::string key = MakeKey(r1, r2, through_db); - uint64_t start_time = Now(env, measured_by_nanosecond); + uint64_t start_time = Now(clock, measured_by_nanosecond); if (!through_db) { PinnableSlice value; MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; - GetContext get_context(ioptions.user_comparator, - ioptions.merge_operator, ioptions.info_log, - ioptions.statistics, GetContext::kNotFound, - Slice(key), &value, nullptr, &merge_context, - true, &max_covering_tombstone_seq, env); + GetContext get_context( + ioptions.user_comparator, ioptions.merge_operator.get(), + ioptions.logger, ioptions.stats, GetContext::kNotFound, + Slice(key), &value, nullptr, &merge_context, true, + &max_covering_tombstone_seq, clock); s = table_reader->Get(read_options, key, &get_context, nullptr); } else { s = db->Get(read_options, key, &result); } - hist.Add(Now(env, measured_by_nanosecond) - start_time); + 
hist.Add(Now(clock, measured_by_nanosecond) - start_time); } else { int r2_len; if (if_query_empty_keys) { @@ -196,7 +197,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::string start_key = MakeKey(r1, r2, through_db); std::string end_key = MakeKey(r1, r2 + r2_len, through_db); uint64_t total_time = 0; - uint64_t start_time = Now(env, measured_by_nanosecond); + uint64_t start_time = Now(clock, measured_by_nanosecond); Iterator* iter = nullptr; InternalIterator* iiter = nullptr; if (!through_db) { @@ -214,10 +215,10 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, break; } // verify key; - total_time += Now(env, measured_by_nanosecond) - start_time; + total_time += Now(clock, measured_by_nanosecond) - start_time; assert(Slice(MakeKey(r1, r2 + count, through_db)) == (through_db ? iter->key() : iiter->key())); - start_time = Now(env, measured_by_nanosecond); + start_time = Now(clock, measured_by_nanosecond); if (++count >= r2_len) { break; } @@ -229,7 +230,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, assert(false); } delete iter; - total_time += Now(env, measured_by_nanosecond) - start_time; + total_time += Now(clock, measured_by_nanosecond) - start_time; hist.Add(total_time); } } diff --git a/table/table_test.cc b/table/table_test.cc index 3333f2bd239..2c5550a4f7c 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -155,6 +155,9 @@ void Increment(const Comparator* cmp, std::string* key) { } } +const auto kUnknownColumnFamily = + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; + } // namespace // Helper class for tests to unify the interface between @@ -172,7 +175,7 @@ class Constructor { // Finish constructing the data structure with all the keys that have // been added so far. 
Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" - void Finish(const Options& options, const ImmutableCFOptions& ioptions, + void Finish(const Options& options, const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, @@ -191,7 +194,7 @@ class Constructor { // Construct the data structure from the data in "data" virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, + const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, @@ -252,9 +255,11 @@ class KeyConvertingIterator : public InternalIterator { Slice key() const override { assert(Valid()); ParsedInternalKey parsed_key; - if (ParseInternalKey(iter_->key(), &parsed_key) != Status::OK()) { - status_ = Status::Corruption("malformed internal key"); - return Slice("corrupted key"); + Status pik_status = + ParseInternalKey(iter_->key(), &parsed_key, true /* log_err_key */); + if (!pik_status.ok()) { + status_ = pik_status; + return Slice(status_.getState()); } return parsed_key.user_key; } @@ -281,7 +286,7 @@ class BlockConstructor : public Constructor { : Constructor(cmp), comparator_(cmp), block_(nullptr) {} ~BlockConstructor() override { delete block_; } Status FinishImpl(const Options& /*options*/, - const ImmutableCFOptions& /*ioptions*/, + const ImmutableOptions& /*ioptions*/, const MutableCFOptions& /*moptions*/, const BlockBasedTableOptions& table_options, const InternalKeyComparator& /*internal_comparator*/, @@ -336,18 +341,18 @@ class TableConstructor : public Constructor { } ~TableConstructor() override { Reset(); } - Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions, + Status FinishImpl(const Options& options, const ImmutableOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& internal_comparator, const stl_wrappers::KVMap& kv_map) override { Reset(); soptions.use_mmap_reads = ioptions.allow_mmap_reads; - file_writer_.reset(test::GetWritableFileWriter(new test::StringSink(), - "" /* don't care */)); + std::unique_ptr sink(new test::StringSink()); + file_writer_.reset(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); std::unique_ptr builder; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; if (largest_seqno_ != 0) { // Pretend that it's an external file written by SstFileWriter. 
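For illustration, the test wiring that the TableConstructor hunk just above switches to: because WritableFileWriter and RandomAccessFileReader now accept FS-level files directly, the test::GetWritableFileWriter / test::GetRandomAccessFileReader helpers are dropped and the string sink/source are wrapped in place. The helper name MakeTestFiles is hypothetical.

    // Illustrative sketch, not part of the patch.
    #include <memory>
    #include <string>

    #include "file/random_access_file_reader.h"
    #include "file/writable_file_writer.h"
    #include "test_util/testutil.h"  // test::StringSink, test::StringSource

    namespace {
    using namespace ROCKSDB_NAMESPACE;

    void MakeTestFiles(const std::string& contents,
                       std::unique_ptr<WritableFileWriter>* writer,
                       std::unique_ptr<RandomAccessFileReader>* reader) {
      // StringSink/StringSource are FS-level files in this revision, so they
      // plug straight into the writer/reader constructors.
      std::unique_ptr<FSWritableFile> sink(new test::StringSink());
      writer->reset(new WritableFileWriter(std::move(sink), "" /* don't care */,
                                           FileOptions()));
      std::unique_ptr<FSRandomAccessFile> source(
          new test::StringSource(contents));
      reader->reset(new RandomAccessFileReader(std::move(source), "test"));
    }
    }  // namespace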
@@ -360,10 +365,8 @@ class TableConstructor : public Constructor { builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, internal_comparator, &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - options.compression_opts, false /* skip_filters */, - column_family_name, level_), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level_), file_writer_.get())); for (const auto& kv : kv_map) { @@ -375,25 +378,27 @@ class TableConstructor : public Constructor { } else { builder->Add(kv.first, kv.second); } - EXPECT_TRUE(builder->status().ok()); + EXPECT_OK(builder->status()); } Status s = builder->Finish(); - file_writer_->Flush(); + EXPECT_OK(file_writer_->Flush()); EXPECT_TRUE(s.ok()) << s.ToString(); EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize()); // Open the table uniq_id_ = cur_uniq_id_++; - file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( - TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); + std::unique_ptr source(new test::StringSource( + TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads)); + + file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); const bool kSkipFilters = true; const bool kImmortal = true; return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, internal_comparator, !kSkipFilters, !kImmortal, false, level_, largest_seqno_, &block_cache_tracer_, - moptions.write_buffer_size), + moptions.write_buffer_size, "", uniq_id_), std::move(file_reader_), TEST_GetSink()->contents().size(), &table_reader_); } @@ -421,10 +426,12 @@ class TableConstructor : public Constructor { key, TableReaderCaller::kUncategorized); } - virtual Status Reopen(const ImmutableCFOptions& ioptions, + virtual Status Reopen(const ImmutableOptions& ioptions, const MutableCFOptions& moptions) { - file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( - TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); + std::unique_ptr source(new test::StringSource( + TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads)); + + file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, *last_internal_key_), @@ -443,8 +450,7 @@ class TableConstructor : public Constructor { bool ConvertToInternalKey() { return convert_to_internal_key_; } test::StringSink* TEST_GetSink() { - return ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter( - file_writer_.get()); + return static_cast(file_writer_->writable_file()); } BlockCacheTracer block_cache_tracer_; @@ -482,27 +488,31 @@ class MemTableConstructor: public Constructor { write_buffer_manager_(wb), table_factory_(new SkipListFactory) { options_.memtable_factory = table_factory_; - ImmutableCFOptions ioptions(options_); + ImmutableOptions ioptions(options_); memtable_ = new MemTable(internal_comparator_, ioptions, MutableCFOptions(options_), wb, kMaxSequenceNumber, 0 /* column_family_id */); memtable_->Ref(); } ~MemTableConstructor() override { delete memtable_->Unref(); } - Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions, + Status FinishImpl(const Options&, const ImmutableOptions& ioptions, const MutableCFOptions& /*moptions*/, const 
BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& /*internal_comparator*/, const stl_wrappers::KVMap& kv_map) override { delete memtable_->Unref(); - ImmutableCFOptions mem_ioptions(ioptions); + ImmutableOptions mem_ioptions(ioptions); memtable_ = new MemTable(internal_comparator_, mem_ioptions, MutableCFOptions(options_), write_buffer_manager_, kMaxSequenceNumber, 0 /* column_family_id */); memtable_->Ref(); int seq = 1; for (const auto& kv : kv_map) { - memtable_->Add(seq, kTypeValue, kv.first, kv.second); + Status s = memtable_->Add(seq, kTypeValue, kv.first, kv.second, + nullptr /* kv_prot_info */); + if (!s.ok()) { + return s; + } seq++; } return Status::OK(); @@ -554,7 +564,7 @@ class DBConstructor: public Constructor { } ~DBConstructor() override { delete db_; } Status FinishImpl(const Options& /*options*/, - const ImmutableCFOptions& /*ioptions*/, + const ImmutableOptions& /*ioptions*/, const MutableCFOptions& /*moptions*/, const BlockBasedTableOptions& /*table_options*/, const InternalKeyComparator& /*internal_comparator*/, @@ -846,7 +856,7 @@ class HarnessTest : public testing::Test { constructor_.reset(new DBConstructor(options_.comparator)); break; } - ioptions_ = ImmutableCFOptions(options_); + ioptions_ = ImmutableOptions(options_); moptions_ = MutableCFOptions(options_); } @@ -1049,7 +1059,7 @@ class HarnessTest : public testing::Test { private: TestArgs args_; Options options_; - ImmutableCFOptions ioptions_; + ImmutableOptions ioptions_; MutableCFOptions moptions_; BlockBasedTableOptions table_options_; std::unique_ptr constructor_; @@ -1130,7 +1140,8 @@ class BlockBasedTableTest &trace_writer)); // Always return Status::OK(). assert(c->block_cache_tracer_ - .StartTrace(env_, trace_opt, std::move(trace_writer)) + .StartTrace(env_->GetSystemClock().get(), trace_opt, + std::move(trace_writer)) .ok()); { std::string user_key = "k01"; @@ -1225,7 +1236,9 @@ class FileChecksumTestHelper { void CreateWriteableFile() { sink_ = new test::StringSink(); - file_writer_.reset(test::GetWritableFileWriter(sink_, "" /* don't care */)); + std::unique_ptr holder(sink_); + file_writer_.reset(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); } void SetFileChecksumGenerator(FileChecksumGenerator* checksum_generator) { @@ -1265,15 +1278,15 @@ class FileChecksumTestHelper { EXPECT_TRUE(table_builder_->status().ok()); } Status s = table_builder_->Finish(); - file_writer_->Flush(); - EXPECT_TRUE(s.ok()); + EXPECT_OK(file_writer_->Flush()); + EXPECT_OK(s); EXPECT_EQ(sink_->contents().size(), table_builder_->FileSize()); return s; } std::string GetFileChecksum() { - file_writer_->Close(); + EXPECT_OK(file_writer_->Close()); return table_builder_->GetFileChecksum(); } @@ -1286,10 +1299,11 @@ class FileChecksumTestHelper { assert(file_checksum_generator != nullptr); cur_uniq_id_ = checksum_uniq_id_++; test::StringSink* ss_rw = - ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter( - file_writer_.get()); - file_reader_.reset(test::GetRandomAccessFileReader( - new test::StringSource(ss_rw->contents()))); + static_cast(file_writer_->writable_file()); + std::unique_ptr source( + new test::StringSource(ss_rw->contents())); + file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); + std::unique_ptr scratch(new char[2048]); Slice result; uint64_t offset = 0; @@ -1396,9 +1410,8 @@ TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) { table_options.block_restart_interval = 1; 
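// MemTable::Add now returns a Status and takes a key/value protection-info
// argument, so test constructors propagate insertion failures instead of
// silently dropping them. A minimal sketch of the checked insertion loop,
// assuming a ref'd MemTable* mem and a kv_map as in the constructor above:
SequenceNumber seq = 1;
for (const auto& kv : kv_map) {
  Status s = mem->Add(seq, kTypeValue, kv.first, kv.second,
                      nullptr /* kv_prot_info */);
  if (!s.ok()) {
    return s;  // surface the failure to the caller
  }
  ++seq;
}
return Status::OK();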
options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); - ioptions.statistics = options.statistics.get(); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_NOT_COMPRESSED), 0); @@ -1445,9 +1458,8 @@ uint64_t BlockBasedTableTest::IndexUncompressedHelper(bool compressed) { table_options.enable_index_compression = compressed; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); - ioptions.statistics = options.statistics.get(); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); c.ResetTableReader(); @@ -1472,7 +1484,7 @@ TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) { BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1506,7 +1518,7 @@ TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) { options.table_properties_collector_factories.emplace_back( new DummyPropertiesCollectorFactory2()); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1550,7 +1562,7 @@ TEST_P(BlockBasedTableTest, RangeDelBlock) { table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); std::unique_ptr internal_cmp( new InternalKeyComparator(options.comparator)); @@ -1571,7 +1583,8 @@ TEST_P(BlockBasedTableTest, RangeDelBlock) { for (size_t i = 0; i < expected_tombstones.size(); i++) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey parsed_key; - ASSERT_OK(ParseInternalKey(iter->key(), &parsed_key)); + ASSERT_OK( + ParseInternalKey(iter->key(), &parsed_key, true /* log_err_key */)); RangeTombstone t(parsed_key, iter->value()); const auto& expected_t = expected_tombstones[i]; ASSERT_EQ(t.start_key_, expected_t.start_key_); @@ -1593,7 +1606,7 @@ TEST_P(BlockBasedTableTest, FilterPolicyNameProperties) { Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1637,7 +1650,7 @@ void PrefetchRange(TableConstructor* c, Options* opt, // reset the cache and reopen the table table_options->block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt->table_factory.reset(NewBlockBasedTableFactory(*table_options)); - const ImmutableCFOptions ioptions2(*opt); + const ImmutableOptions ioptions2(*opt); const MutableCFOptions moptions(*opt); ASSERT_OK(c->Reopen(ioptions2, moptions)); @@ -1695,7 
+1708,7 @@ TEST_P(BlockBasedTableTest, PrefetchTest) { c.Add("k07", std::string(100000, 'x')); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); + const ImmutableOptions ioptions(opt); const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); c.ResetTableReader(); @@ -1796,7 +1809,7 @@ TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { c.Add("cccc2", std::string('a', 56)); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -1855,7 +1868,7 @@ TEST_P(BlockBasedTableTest, NoopTransformSeek) { c.Add(key.Encode().ToString(), "b"); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); c.Finish(options, ioptions, moptions, table_options, internal_comparator, @@ -1893,14 +1906,14 @@ TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) { c.Add(key.Encode().ToString(), "test"); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); c.Finish(options, ioptions, moptions, table_options, internal_comparator, &keys, &kvmap); // TODO(Zhongyi): update test to use MutableCFOptions options.prefix_extractor.reset(NewFixedPrefixTransform(9)); - const ImmutableCFOptions new_ioptions(options); + const ImmutableOptions new_ioptions(options); const MutableCFOptions new_moptions(options); ASSERT_OK(c.Reopen(new_ioptions, new_moptions)); auto reader = c.GetTableReader(); @@ -1957,7 +1970,7 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) { std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, &kvmap); @@ -2160,7 +2173,7 @@ TEST_P(BlockBasedTableTest, IndexSeekOptimizationIncomplete) { BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); TableConstructor c(BytewiseComparator()); @@ -2247,7 +2260,7 @@ TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKey2) { Statistics* stats = options.statistics.get(); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); TableConstructor c(BytewiseComparator()); @@ -2445,7 +2458,7 @@ TEST_P(BlockBasedTableTest, BinaryIndexWithFirstKeyGlobalSeqno) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); TableConstructor 
c(BytewiseComparator(), /* convert_to_internal_key */ false, @@ -2520,7 +2533,7 @@ TEST_P(BlockBasedTableTest, IndexSizeStat) { table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); @@ -2549,7 +2562,7 @@ TEST_P(BlockBasedTableTest, NumBlockStat) { std::vector ks; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); @@ -2570,7 +2583,7 @@ TEST_P(BlockBasedTableTest, TracingGetTest) { SetupTracingTest(&c); std::vector keys; stl_wrappers::KVMap kvmap; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2644,7 +2657,7 @@ TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) { SetupTracingTest(&c); std::vector keys; stl_wrappers::KVMap kvmap; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2688,7 +2701,7 @@ TEST_P(BlockBasedTableTest, TracingIterator) { SetupTracingTest(&c); std::vector keys; stl_wrappers::KVMap kvmap; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2818,7 +2831,7 @@ TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("key", "value"); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2870,7 +2883,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("key", "value"); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); @@ -2950,7 +2963,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { table_options.block_cache = NewLRUCache(1, 4); options.statistics = CreateDBStatistics(); options.table_factory.reset(new BlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions2(options); + const ImmutableOptions ioptions2(options); const MutableCFOptions moptions2(options); ASSERT_OK(c.Reopen(ioptions2, moptions2)); { @@ -2997,7 +3010,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { std::string user_key = "k01"; InternalKey internal_key(user_key, 0, kTypeValue); c3.Add(internal_key.Encode().ToString(), "hello"); - ImmutableCFOptions ioptions3(options); + ImmutableOptions ioptions3(options); MutableCFOptions 
moptions3(options); // Generate table without filter policy c3.Finish(options, ioptions3, moptions3, table_options, @@ -3008,7 +3021,7 @@ TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { table_options.filter_policy.reset(NewBloomFilterPolicy(1)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); options.statistics = CreateDBStatistics(); - ImmutableCFOptions ioptions4(options); + ImmutableOptions ioptions4(options); MutableCFOptions moptions4(options); ASSERT_OK(c3.Reopen(ioptions4, moptions4)); reader = dynamic_cast(c3.GetTableReader()); @@ -3092,7 +3105,7 @@ TEST_P(BlockBasedTableTest, BlockReadCountTest) { InternalKey internal_key(user_key, 0, kTypeValue); std::string encoded_key = internal_key.Encode().ToString(); c.Add(encoded_key, "hello"); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); // Generate table with filter policy c.Finish(options, ioptions, moptions, table_options, @@ -3180,7 +3193,7 @@ TEST_P(BlockBasedTableTest, BlockCacheLeak) { c.Add("k07", std::string(100000, 'x')); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); + const ImmutableOptions ioptions(opt); const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); @@ -3195,7 +3208,7 @@ TEST_P(BlockBasedTableTest, BlockCacheLeak) { ASSERT_OK(iter->status()); iter.reset(); - const ImmutableCFOptions ioptions1(opt); + const ImmutableOptions ioptions1(opt); const MutableCFOptions moptions1(opt); ASSERT_OK(c.Reopen(ioptions1, moptions1)); auto table_reader = dynamic_cast(c.GetTableReader()); @@ -3208,7 +3221,7 @@ TEST_P(BlockBasedTableTest, BlockCacheLeak) { // rerun with different block cache table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions2(opt); + const ImmutableOptions ioptions2(opt); const MutableCFOptions moptions2(opt); ASSERT_OK(c.Reopen(ioptions2, moptions2)); table_reader = dynamic_cast(c.GetTableReader()); @@ -3268,7 +3281,7 @@ TEST_P(BlockBasedTableTest, MemoryAllocator) { c.Add("k07", std::string(100000, 'x')); std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); + const ImmutableOptions ioptions(opt); const MutableCFOptions moptions(opt); c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); @@ -3294,14 +3307,13 @@ TEST_P(BlockBasedTableTest, MemoryAllocator) { // Test the file checksum of block based table TEST_P(BlockBasedTableTest, NoFileChecksum) { Options options; - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); int level = 0; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; FileChecksumTestHelper f(true); @@ -3310,14 +3322,12 @@ TEST_P(BlockBasedTableTest, NoFileChecksum) { builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, *comparator, &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - options.compression_opts, false /* skip_filters */, - column_family_name, level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + options.compression, 
options.compression_opts, + kUnknownColumnFamily, column_family_name, level), f.GetFileWriter())); ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); + ASSERT_OK(f.WriteKVAndFlushTable()); ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName); ASSERT_STREQ(f.GetFileChecksum().c_str(), kUnknownFileChecksum); } @@ -3327,14 +3337,13 @@ TEST_P(BlockBasedTableTest, Crc32cFileChecksum) { new FileChecksumGenCrc32cFactory(); Options options; options.file_checksum_gen_factory.reset(file_checksum_gen_factory); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); int level = 0; - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; FileChecksumGenContext gen_context; @@ -3349,14 +3358,12 @@ TEST_P(BlockBasedTableTest, Crc32cFileChecksum) { builder.reset(ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, *comparator, &int_tbl_prop_collector_factories, - options.compression, options.sample_for_compression, - options.compression_opts, false /* skip_filters */, - column_family_name, level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + options.compression, options.compression_opts, + kUnknownColumnFamily, column_family_name, level), f.GetFileWriter())); ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); + ASSERT_OK(f.WriteKVAndFlushTable()); ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c"); std::unique_ptr checksum_crc32c_gen2 = @@ -3386,23 +3393,21 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { plain_table_options.hash_table_ratio = 0; PlainTableFactory factory(plain_table_options); - test::StringSink sink; - std::unique_ptr file_writer( - test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); + std::unique_ptr sink(new test::StringSink()); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); Options options; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, unknown_level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), file_writer.get())); for (char c = 'a'; c <= 'z'; ++c) { @@ -3412,13 +3417,14 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { builder->Add(key, value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); test::StringSink* ss = - ROCKSDB_NAMESPACE::test::GetStringSinkFromLegacyWriter(file_writer.get()); + 
static_cast(file_writer->writable_file()); + std::unique_ptr source( + new test::StringSource(ss->contents(), 72242, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss->contents(), 72242, true))); + new RandomAccessFileReader(std::move(source), "test")); TableProperties* props = nullptr; auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), @@ -3443,26 +3449,24 @@ TEST_F(PlainTableTest, NoFileChecksum) { PlainTableFactory factory(plain_table_options); Options options; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; FileChecksumTestHelper f(true); f.CreateWriteableFile(); std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, unknown_level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), f.GetFileWriter())); ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); + ASSERT_OK(f.WriteKVAndFlushTable()); ASSERT_STREQ(f.GetFileChecksumFuncName(), kUnknownFileChecksumFuncName); EXPECT_EQ(f.GetFileChecksum(), kUnknownFileChecksum); } @@ -3478,11 +3482,10 @@ TEST_F(PlainTableTest, Crc32cFileChecksum) { new FileChecksumGenCrc32cFactory(); Options options; options.file_checksum_gen_factory.reset(file_checksum_gen_factory); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; @@ -3496,15 +3499,14 @@ TEST_F(PlainTableTest, Crc32cFileChecksum) { f.SetFileChecksumGenerator(checksum_crc32c_gen1.release()); std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions( - ioptions, moptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, unknown_level), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, unknown_level), f.GetFileWriter())); ASSERT_OK(f.ResetTableBuilder(std::move(builder))); f.AddKVtoKVMap(1000); - f.WriteKVAndFlushTable(); + ASSERT_OK(f.WriteKVAndFlushTable()); ASSERT_STREQ(f.GetFileChecksumFuncName(), "FileChecksumCrc32c"); std::unique_ptr checksum_crc32c_gen2 = @@ -3529,11 +3531,12 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) { std::vector keys; stl_wrappers::KVMap kvmap; Options options; + options.db_host_id = ""; test::PlainInternalKeyComparator internal_comparator(options.comparator); options.compression = kNoCompression; BlockBasedTableOptions table_options; table_options.block_size 
= 1024; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, internal_comparator, &keys, &kvmap); @@ -3569,16 +3572,16 @@ static void DoCompressionTest(CompressionType comp) { options.compression = comp; BlockBasedTableOptions table_options; table_options.block_size = 1024; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3500)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3500)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7000)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3525)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3525)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7050)); c.ResetTableReader(); } @@ -3661,19 +3664,33 @@ TEST_F(DBHarnessTest, RandomizedLongDB) { #endif // ROCKSDB_LITE #endif // ROCKSDB_VALGRIND_RUN -class MemTableTest : public testing::Test {}; +class MemTableTest : public testing::Test { + public: + MemTableTest() { + InternalKeyComparator cmp(BytewiseComparator()); + auto table_factory = std::make_shared(); + options_.memtable_factory = table_factory; + ImmutableOptions ioptions(options_); + wb_ = new WriteBufferManager(options_.db_write_buffer_size); + memtable_ = new MemTable(cmp, ioptions, MutableCFOptions(options_), wb_, + kMaxSequenceNumber, 0 /* column_family_id */); + memtable_->Ref(); + } + + ~MemTableTest() { + delete memtable_->Unref(); + delete wb_; + } + + MemTable* GetMemTable() { return memtable_; } + + private: + MemTable* memtable_; + Options options_; + WriteBufferManager* wb_; +}; TEST_F(MemTableTest, Simple) { - InternalKeyComparator cmp(BytewiseComparator()); - auto table_factory = std::make_shared(); - Options options; - options.memtable_factory = table_factory; - ImmutableCFOptions ioptions(options); - WriteBufferManager wb(options.db_write_buffer_size); - MemTable* memtable = - new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, - kMaxSequenceNumber, 0 /* column_family_id */); - memtable->Ref(); WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); ASSERT_OK(batch.Put(std::string("k1"), std::string("v1"))); @@ -3682,7 +3699,7 @@ TEST_F(MemTableTest, Simple) { ASSERT_OK(batch.Put(std::string("largekey"), std::string("vlarge"))); ASSERT_OK(batch.DeleteRange(std::string("chi"), std::string("xigua"))); ASSERT_OK(batch.DeleteRange(std::string("begin"), std::string("end"))); - ColumnFamilyMemTablesDefault cf_mems_default(memtable); + ColumnFamilyMemTablesDefault cf_mems_default(GetMemTable()); ASSERT_TRUE( WriteBatchInternal::InsertInto(&batch, &cf_mems_default, nullptr, nullptr) .ok()); @@ -3693,10 +3710,10 @@ TEST_F(MemTableTest, Simple) { std::unique_ptr iter_guard; InternalIterator* iter; if (i == 0) { - iter = memtable->NewIterator(ReadOptions(), &arena); + iter = GetMemTable()->NewIterator(ReadOptions(), &arena); arena_iter_guard.set(iter); } else { - iter = memtable->NewRangeTombstoneIterator( + iter = GetMemTable()->NewRangeTombstoneIterator( ReadOptions(), kMaxSequenceNumber /* read_seq */); iter_guard.reset(iter); } @@ -3710,8 +3727,6 
@@ TEST_F(MemTableTest, Simple) { iter->Next(); } } - - delete memtable->Unref(); } // Test the empty key @@ -3893,6 +3908,8 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { table_options.index_block_restart_interval = index_block_restart_interval; if (value_delta_encoding) { table_options.format_version = 4; + } else { + table_options.format_version = 3; } options.table_factory.reset(new BlockBasedTableFactory(table_options)); @@ -3907,7 +3924,7 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { stl_wrappers::KVMap kvmap; std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, &kvmap); @@ -3959,8 +3976,7 @@ class TestPrefixExtractor : public ROCKSDB_NAMESPACE::SliceTransform { } bool InDomain(const ROCKSDB_NAMESPACE::Slice& src) const override { - assert(IsValid(src)); - return true; + return IsValid(src); } bool InRange(const ROCKSDB_NAMESPACE::Slice& /*dst*/) const override { @@ -4003,7 +4019,7 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) { const std::string kDBPath = test::PerThreadDBPath("table_prefix_test"); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyDB(kDBPath, options); + ASSERT_OK(DestroyDB(kDBPath, options)); ROCKSDB_NAMESPACE::DB* db; ASSERT_OK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); @@ -4033,15 +4049,15 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) { TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); test::StringSink* sink = new test::StringSink(); - std::unique_ptr file_writer( - test::GetWritableFileWriter(sink, "" /* don't care */)); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); Options options; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; int_tbl_prop_collector_factories.emplace_back( new SstFileWriterPropertiesCollectorFactory(2 /* version */, 0 /* global_seqno*/)); @@ -4049,9 +4065,8 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { std::unique_ptr builder(options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, -1), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), file_writer.get())); for (char c = 'a'; c <= 'z'; ++c) { @@ -4062,7 +4077,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); test::RandomRWStringSink ss_rw(sink); uint32_t version; @@ -4071,9 +4086,10 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { // Helper function to get version, global_seqno, global_seqno_offset std::function GetVersionAndGlobalSeqno = [&]() { + std::unique_ptr source( + new 
test::StringSource(ss_rw.contents(), 73342, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); + new RandomAccessFileReader(std::move(source), "")); TableProperties* props = nullptr; ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), @@ -4096,16 +4112,18 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { std::string new_global_seqno; PutFixed64(&new_global_seqno, val); - ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno)); + ASSERT_OK(ss_rw.Write(global_seqno_offset, new_global_seqno, IOOptions(), + nullptr)); }; // Helper function to get the contents of the table InternalIterator std::unique_ptr table_reader; const ReadOptions read_options; std::function GetTableInternalIter = [&]() { + std::unique_ptr source( + new test::StringSource(ss_rw.contents(), 73342, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); + new RandomAccessFileReader(std::move(source), "")); options.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor.get(), @@ -4125,7 +4143,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { char current_c = 'a'; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey pik; - ASSERT_OK(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 0); @@ -4146,7 +4164,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { current_c = 'a'; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey pik; - ASSERT_OK(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 10); @@ -4164,7 +4182,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey pik; - ASSERT_OK(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 10); @@ -4183,7 +4201,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { current_c = 'a'; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey pik; - ASSERT_OK(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 3); @@ -4202,7 +4220,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey pik; - ASSERT_OK(ParseInternalKey(iter->key(), &pik)); + ASSERT_OK(ParseInternalKey(iter->key(), &pik, true /* log_err_key */)); ASSERT_EQ(pik.type, ValueType::kTypeValue); ASSERT_EQ(pik.sequence, 3); @@ -4217,23 +4235,22 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - std::unique_ptr file_writer( - test::GetWritableFileWriter(sink, "" /* don't care */)); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); Options options; options.compression = kNoCompression; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - const 
ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; std::unique_ptr builder(options.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, -1), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), file_writer.get())); for (int i = 1; i <= 10000; ++i) { @@ -4246,19 +4263,18 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); - test::RandomRWStringSink ss_rw(sink); + std::unique_ptr source( + new test::StringSource(sink->contents(), 73342, false)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); - + new RandomAccessFileReader(std::move(source), "test")); // Helper function to get version, global_seqno, global_seqno_offset std::function VerifyBlockAlignment = [&]() { TableProperties* props = nullptr; - ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), - kBlockBasedTableMagicNumber, ioptions, - &props, true /* compression_type_missing */)); + ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(), + kBlockBasedTableMagicNumber, ioptions, &props, + true /* compression_type_missing */)); uint64_t data_block_size = props->data_size / props->num_data_blocks; ASSERT_EQ(data_block_size, 4096); @@ -4275,14 +4291,14 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { bbto.block_align = false; Options options2; options2.table_factory.reset(NewBlockBasedTableFactory(bbto)); - ImmutableCFOptions ioptions2(options2); + ImmutableOptions ioptions2(options2); const MutableCFOptions moptions2(options2); ASSERT_OK(ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions2, moptions2.prefix_extractor.get(), EnvOptions(), GetPlainInternalComparator(options2.comparator)), - std::move(file_reader), ss_rw.contents().size(), &table_reader)); + std::move(file_reader), sink->contents().size(), &table_reader)); ReadOptions read_options; std::unique_ptr db_iter(table_reader->NewIterator( @@ -4309,26 +4325,25 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); bbto.block_align = true; test::StringSink* sink = new test::StringSink(); - std::unique_ptr file_writer( - test::GetWritableFileWriter(sink, "" /* don't care */)); + std::unique_ptr holder(sink); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(holder), "" /* don't care */, FileOptions())); Options options; options.compression = kNoCompression; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); - std::vector> - int_tbl_prop_collector_factories; + IntTblPropCollectorFactories int_tbl_prop_collector_factories; std::string column_family_name; std::unique_ptr builder(options.table_factory->NewTableBuilder( 
TableBuilderOptions(ioptions, moptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, - 0 /* sample_for_compression */, CompressionOptions(), - false /* skip_filters */, column_family_name, -1), - TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + CompressionOptions(), kUnknownColumnFamily, + column_family_name, -1), file_writer.get())); for (int i = 1; i <= 10000; ++i) { @@ -4341,16 +4356,16 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { builder->Add(ik.Encode(), value); } ASSERT_OK(builder->Finish()); - file_writer->Flush(); + ASSERT_OK(file_writer->Flush()); - test::RandomRWStringSink ss_rw(sink); + std::unique_ptr source( + new test::StringSource(sink->contents(), 73342, true)); std::unique_ptr file_reader( - test::GetRandomAccessFileReader( - new test::StringSource(ss_rw.contents(), 73342, true))); + new RandomAccessFileReader(std::move(source), "test")); { RandomAccessFileReader* file = file_reader.get(); - uint64_t file_size = ss_rw.contents().size(); + uint64_t file_size = sink->contents().size(); Footer footer; IOOptions opts; @@ -4424,7 +4439,7 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { table_options.filter_policy.reset(NewBloomFilterPolicy( 8 /* bits_per_key */, false /* use_block_based_filter */)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ImmutableCFOptions ioptions(options); + ImmutableOptions ioptions(options); MutableCFOptions moptions(options); std::vector keys; stl_wrappers::KVMap kvmap; @@ -4433,10 +4448,11 @@ TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { // get file reader test::StringSink* table_sink = c.TEST_GetSink(); - std::unique_ptr table_reader{ - test::GetRandomAccessFileReader( - new test::StringSource(table_sink->contents(), 0 /* unique_id */, - false /* allow_mmap_reads */))}; + std::unique_ptr source(new test::StringSource( + table_sink->contents(), 0 /* unique_id */, false /* allow_mmap_reads */)); + + std::unique_ptr table_reader( + new RandomAccessFileReader(std::move(source), "test")); size_t table_size = table_sink->contents().size(); // read footer @@ -4492,7 +4508,7 @@ TEST_P(BlockBasedTableTest, BadOptions) { const std::string kDBPath = test::PerThreadDBPath("block_based_table_bad_options_test"); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); - DestroyDB(kDBPath, options); + ASSERT_OK(DestroyDB(kDBPath, options)); ROCKSDB_NAMESPACE::DB* db; ASSERT_NOK(ROCKSDB_NAMESPACE::DB::Open(options, kDBPath, &db)); @@ -4540,9 +4556,9 @@ TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) { TailPrefetchStats tpstats; FilePrefetchBuffer buffer(nullptr, 0, 0, false, true); IOOptions opts; - buffer.TryReadFromCache(opts, 500, 10, nullptr); - buffer.TryReadFromCache(opts, 480, 10, nullptr); - buffer.TryReadFromCache(opts, 490, 10, nullptr); + buffer.TryReadFromCache(opts, 500, 10, nullptr, nullptr); + buffer.TryReadFromCache(opts, 480, 10, nullptr, nullptr); + buffer.TryReadFromCache(opts, 490, 10, nullptr, nullptr); ASSERT_EQ(480, buffer.min_offset_read()); } @@ -4572,7 +4588,7 @@ TEST_P(BlockBasedTableTest, DataBlockHashIndex) { std::vector keys; stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); c.Finish(options, ioptions, moptions, table_options, internal_comparator, @@ -4655,7 +4671,7 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnSeek) { Options options; 
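// The BlockAlignTest / PropertiesBlockRestartPointTest hunks above build a
// table with bbto.block_align = true and kNoCompression (alignment padding is
// only applied to uncompressed blocks), then confirm the layout by reading
// the table properties back. A minimal sketch of that verification, assuming
// a RandomAccessFileReader over the written sink contents as constructed
// above; 4096 is the alignment the test expects.
TableProperties* props = nullptr;
ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(),
                              kBlockBasedTableMagicNumber, ioptions, &props,
                              true /* compression_type_missing */));
uint64_t data_block_size = props->data_size / props->num_data_blocks;
ASSERT_EQ(data_block_size, 4096);
ASSERT_EQ(props->data_size, data_block_size * props->num_data_blocks);
delete props;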
BlockBasedTableOptions table_opt(GetBlockBasedTableOptions()); options.table_factory.reset(NewBlockBasedTableFactory(table_opt)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_opt, GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap); @@ -4695,7 +4711,7 @@ TEST_P(BlockBasedTableTest, OutOfBoundOnNext) { table_opt.flush_block_policy_factory = std::make_shared(); options.table_factory.reset(NewBlockBasedTableFactory(table_opt)); - const ImmutableCFOptions ioptions(options); + const ImmutableOptions ioptions(options); const MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_opt, GetPlainInternalComparator(BytewiseComparator()), &keys, &kvmap); diff --git a/test_util/mock_time_env.cc b/test_util/mock_time_env.cc index 8316406ec59..23888e69e3f 100644 --- a/test_util/mock_time_env.cc +++ b/test_util/mock_time_env.cc @@ -12,7 +12,7 @@ namespace ROCKSDB_NAMESPACE { // TODO: this is a workaround for the different behavior on different platform // for timedwait timeout. Ideally timedwait API should be moved to env. // details: PR #7101. -void MockTimeEnv::InstallTimedWaitFixCallback() { +void MockSystemClock::InstallTimedWaitFixCallback() { #ifndef NDEBUG SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); diff --git a/test_util/mock_time_env.h b/test_util/mock_time_env.h index 1f454144ad1..61dc4e4437b 100644 --- a/test_util/mock_time_env.h +++ b/test_util/mock_time_env.h @@ -6,18 +6,21 @@ #pragma once #include +#include -#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { // NOTE: SpecialEnv offers most of this functionality, along with hooks // for safe DB behavior under a mock time environment, so should be used -// instead of MockTimeEnv for DB tests. -class MockTimeEnv : public EnvWrapper { +// instead of MockSystemClock for DB tests. +class MockSystemClock : public SystemClockWrapper { public: - explicit MockTimeEnv(Env* base) : EnvWrapper(base) {} + explicit MockSystemClock(const std::shared_ptr& base) + : SystemClockWrapper(base) {} + const char* Name() const override { return "MockSystemClock"; } virtual Status GetCurrentTime(int64_t* time_sec) override { assert(time_sec != nullptr); *time_sec = static_cast(current_time_us_ / kMicrosInSecond); @@ -33,9 +36,9 @@ class MockTimeEnv : public EnvWrapper { return current_time_us_ * 1000; } - uint64_t RealNowMicros() { return target()->NowMicros(); } + uint64_t RealNowMicros() { return target_->NowMicros(); } - void set_current_time(uint64_t time_sec) { + void SetCurrentTime(uint64_t time_sec) { assert(time_sec < std::numeric_limits::max() / kMicrosInSecond); assert(time_sec * kMicrosInSecond >= current_time_us_); current_time_us_ = time_sec * kMicrosInSecond; diff --git a/test_util/sync_point.cc b/test_util/sync_point.cc index 16eb4e55315..4c71fc6bc46 100644 --- a/test_util/sync_point.cc +++ b/test_util/sync_point.cc @@ -6,11 +6,9 @@ #include "test_util/sync_point.h" #include -#include #include "test_util/sync_point_impl.h" -int rocksdb_kill_odds = 0; std::vector rocksdb_kill_exclude_prefixes; #ifndef NDEBUG diff --git a/test_util/sync_point.h b/test_util/sync_point.h index 08d6c037abb..775fd5c36e6 100644 --- a/test_util/sync_point.h +++ b/test_util/sync_point.h @@ -13,34 +13,42 @@ #include "rocksdb/rocksdb_namespace.h" -// This is only set from db_stress.cc and for testing only. 
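// The mock_time_env.h hunk above renames MockTimeEnv to MockSystemClock and
// rebases it on SystemClockWrapper, so tests wrap a SystemClock rather than
// an Env. A minimal usage sketch, assuming SystemClock::Default() as the
// wrapped clock; the variable names are illustrative.
auto mock_clock = std::make_shared<MockSystemClock>(SystemClock::Default());
mock_clock->SetCurrentTime(100 /* seconds */);    // was set_current_time()
int64_t now_sec = 0;
ASSERT_OK(mock_clock->GetCurrentTime(&now_sec));  // reads the mocked time
uint64_t real_us = mock_clock->RealNowMicros();   // bypasses the mock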
-// If non-zero, kill at various points in source code with probability 1/this -extern int rocksdb_kill_odds; -// If kill point has a prefix on this list, will skip killing. -extern std::vector rocksdb_kill_exclude_prefixes; - #ifdef NDEBUG // empty in release build -#define TEST_KILL_RANDOM(kill_point, rocksdb_kill_odds) +#define TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, rocksdb_kill_odds_weight) +#define TEST_KILL_RANDOM(kill_point) #else namespace ROCKSDB_NAMESPACE { -// Kill the process with probability 1/odds for testing. -extern void TestKillRandom(std::string kill_point, int odds, - const std::string& srcfile, int srcline); // To avoid crashing always at some frequently executed codepaths (during // kill random test), use this factor to reduce odds #define REDUCE_ODDS 2 #define REDUCE_ODDS2 4 -#define TEST_KILL_RANDOM(kill_point, rocksdb_kill_odds) \ - { \ - if (rocksdb_kill_odds > 0) { \ - TestKillRandom(kill_point, rocksdb_kill_odds, __FILE__, __LINE__); \ - } \ +// A class used to pass when a kill point is reached. +struct KillPoint { + public: + // This is only set from db_stress.cc and for testing only. + // If non-zero, kill at various points in source code with probability 1/this + int rocksdb_kill_odds = 0; + // If kill point has a prefix on this list, will skip killing. + std::vector rocksdb_kill_exclude_prefixes; + // Kill the process with probability 1/odds for testing. + void TestKillRandom(std::string kill_point, int odds, + const std::string& srcfile, int srcline); + + static KillPoint* GetInstance(); +}; + +#define TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, rocksdb_kill_odds_weight) \ + { \ + KillPoint::GetInstance()->TestKillRandom( \ + kill_point, rocksdb_kill_odds_weight, __FILE__, __LINE__); \ } +#define TEST_KILL_RANDOM(kill_point) TEST_KILL_RANDOM_WITH_WEIGHT(kill_point, 1) } // namespace ROCKSDB_NAMESPACE + #endif #ifdef NDEBUG diff --git a/test_util/sync_point_impl.cc b/test_util/sync_point_impl.cc index e1877e39860..1d87a05fe07 100644 --- a/test_util/sync_point_impl.cc +++ b/test_util/sync_point_impl.cc @@ -7,9 +7,17 @@ #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { +KillPoint* KillPoint::GetInstance() { + static KillPoint kp; + return &kp; +} -void TestKillRandom(std::string kill_point, int odds, - const std::string& srcfile, int srcline) { +void KillPoint::TestKillRandom(std::string kill_point, int odds_weight, + const std::string& srcfile, int srcline) { + if (rocksdb_kill_odds <= 0) { + return; + } + int odds = rocksdb_kill_odds * odds_weight; for (auto& p : rocksdb_kill_exclude_prefixes) { if (kill_point.substr(0, p.length()) == p) { return; @@ -29,7 +37,6 @@ void TestKillRandom(std::string kill_point, int odds, } } - void SyncPoint::Data::LoadDependency(const std::vector& dependencies) { std::lock_guard lock(mutex_); successors_.clear(); @@ -38,6 +45,8 @@ void SyncPoint::Data::LoadDependency(const std::vector& dependenc for (const auto& dependency : dependencies) { successors_[dependency.predecessor].push_back(dependency.successor); predecessors_[dependency.successor].push_back(dependency.predecessor); + point_filter_.Add(dependency.successor); + point_filter_.Add(dependency.predecessor); } cv_.notify_all(); } @@ -54,11 +63,15 @@ void SyncPoint::Data::LoadDependencyAndMarkers( for (const auto& dependency : dependencies) { successors_[dependency.predecessor].push_back(dependency.successor); predecessors_[dependency.successor].push_back(dependency.predecessor); + point_filter_.Add(dependency.successor); + point_filter_.Add(dependency.predecessor); } for (const 
auto& marker : markers) { successors_[marker.predecessor].push_back(marker.successor); predecessors_[marker.successor].push_back(marker.predecessor); markers_[marker.predecessor].push_back(marker.successor); + point_filter_.Add(marker.predecessor); + point_filter_.Add(marker.successor); } cv_.notify_all(); } @@ -92,6 +105,10 @@ void SyncPoint::Data::Process(const std::string& point, void* cb_arg) { if (!enabled_) { return; } + // Use a filter to prevent mutex lock if possible. + if (!point_filter_.MayContain(point)) { + return; + } std::unique_lock lock(mutex_); auto thread_id = std::this_thread::get_id(); @@ -100,6 +117,7 @@ void SyncPoint::Data::Process(const std::string& point, void* cb_arg) { if (marker_iter != markers_.end()) { for (auto& marked_point : marker_iter->second) { marked_thread_id_.emplace(marked_point, thread_id); + point_filter_.Add(marked_point); } } diff --git a/test_util/sync_point_impl.h b/test_util/sync_point_impl.h index b246c019857..ba818e381fd 100644 --- a/test_util/sync_point_impl.h +++ b/test_util/sync_point_impl.h @@ -3,9 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "test_util/sync_point.h" - #include + #include #include #include @@ -15,15 +14,39 @@ #include #include +#include "memory/concurrent_arena.h" #include "port/port.h" +#include "test_util/sync_point.h" +#include "util/dynamic_bloom.h" #include "util/random.h" #pragma once #ifndef NDEBUG namespace ROCKSDB_NAMESPACE { +// A hacky allocator for single use. +// Arena depends on SyncPoint and create circular dependency. +class SingleAllocator : public Allocator { + public: + char* Allocate(size_t) override { + assert(false); + return nullptr; + } + char* AllocateAligned(size_t bytes, size_t, Logger*) override { + buf_.resize(bytes); + return const_cast(buf_.data()); + } + size_t BlockSize() const override { + assert(false); + return 0; + } + + private: + std::string buf_; +}; + struct SyncPoint::Data { - Data() : enabled_(false) {} + Data() : point_filter_(&alloc_, /*total_bits=*/8192), enabled_(false) {} // Enable proper deletion by subclasses virtual ~Data() {} // successor/predecessor map loaded from LoadDependency @@ -37,6 +60,9 @@ struct SyncPoint::Data { std::condition_variable cv_; // sync points that have been passed through std::unordered_set cleared_points_; + SingleAllocator alloc_; + // A filter before holding mutex to speed up process. 
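// The sync_point changes above move the kill-test knobs from process-wide
// globals onto a KillPoint singleton, and the kill macros now take the point
// name plus an optional weight. A minimal sketch of how a stress-test driver
// would configure it; the point names and prefixes shown are illustrative.
KillPoint* kp = KillPoint::GetInstance();
kp->rocksdb_kill_odds = 10;                     // kill with probability 1/10
kp->rocksdb_kill_exclude_prefixes = {"Posix"};  // skip points with this prefix
// In instrumented code paths:
TEST_KILL_RANDOM("SomePoint");                               // weight 1
TEST_KILL_RANDOM_WITH_WEIGHT("SomeHotPoint", REDUCE_ODDS2);  // 4x less likely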
+ DynamicBloom point_filter_; std::atomic enabled_; int num_callbacks_running_ = 0; @@ -48,6 +74,7 @@ struct SyncPoint::Data { const std::function& callback) { std::lock_guard lock(mutex_); callbacks_[point] = callback; + point_filter_.Add(point); } void ClearCallBack(const std::string& point); diff --git a/test_util/testharness.cc b/test_util/testharness.cc index 50e105c51d3..d8650dafbd7 100644 --- a/test_util/testharness.cc +++ b/test_util/testharness.cc @@ -14,6 +14,14 @@ namespace ROCKSDB_NAMESPACE { namespace test { +#ifdef OS_WIN +#include + +std::string GetPidStr() { return std::to_string(GetCurrentProcessId()); } +#else +std::string GetPidStr() { return std::to_string(getpid()); } +#endif + ::testing::AssertionResult AssertStatus(const char* s_expr, const Status& s) { if (s.ok()) { return ::testing::AssertionSuccess(); @@ -32,7 +40,7 @@ std::string TmpDir(Env* env) { std::string PerThreadDBPath(std::string dir, std::string name) { size_t tid = std::hash()(std::this_thread::get_id()); - return dir + "/" + name + "_" + std::to_string(tid); + return dir + "/" + name + "_" + GetPidStr() + "_" + std::to_string(tid); } std::string PerThreadDBPath(std::string name) { diff --git a/test_util/testharness.h b/test_util/testharness.h index 60a195e2b79..739f32cb9b0 100644 --- a/test_util/testharness.h +++ b/test_util/testharness.h @@ -15,6 +15,42 @@ #include #endif +// A "skipped" test has a specific meaning in Facebook infrastructure: the +// test is in good shape and should be run, but something about the +// compilation or execution environment means the test cannot be run. +// Specifically, there is a hole in intended testing if any +// parameterization of a test (e.g. Foo/FooTest.Bar/42) is skipped for all +// tested build configurations/platforms/etc. +// +// If GTEST_SKIP is available, use it. Otherwise, define skip as success. +// +// The GTEST macros do not seem to print the message, even with -verbose, +// so these print to stderr. Note that these do not exit the test themselves; +// calling code should 'return' or similar from the test. +#ifdef GTEST_SKIP_ +#define ROCKSDB_GTEST_SKIP(m) \ + do { \ + fputs("SKIPPED: " m "\n", stderr); \ + GTEST_SKIP_(m); \ + } while (false) /* user ; */ +#else +#define ROCKSDB_GTEST_SKIP(m) \ + do { \ + fputs("SKIPPED: " m "\n", stderr); \ + GTEST_SUCCESS_("SKIPPED: " m); \ + } while (false) /* user ; */ +#endif + +// We add "bypass" as an alternative to ROCKSDB_GTEST_SKIP that is allowed to +// be a permanent condition, e.g. for intentionally omitting or disabling some +// parameterizations for some tests. (Use _DISABLED at the end of the test +// name to disable an entire test.) 
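// With the DynamicBloom point_filter_ added above, SyncPoint::Process() can
// reject names that were never registered before taking the mutex. A minimal
// sketch of the fast path this buys, assuming the standard SyncPoint API;
// the point names are illustrative.
SyncPoint::GetInstance()->SetCallBack(
    "Illustrative::Point", [](void* /*arg*/) { /* observe the hit */ });
SyncPoint::GetInstance()->EnableProcessing();
TEST_SYNC_POINT("Illustrative::Point");  // in the filter: lock + callback
TEST_SYNC_POINT("Never::Registered");    // usually rejected by MayContain()
                                         // before locking (Bloom filter, so
                                         // false positives are possible)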
+#define ROCKSDB_GTEST_BYPASS(m) \ + do { \ + fputs("BYPASSED: " m "\n", stderr); \ + GTEST_SUCCESS_("BYPASSED: " m); \ + } while (false) /* user ; */ + #include #include "rocksdb/env.h" diff --git a/test_util/testutil.cc b/test_util/testutil.cc index d05bb766e27..450598cecb9 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -24,6 +24,7 @@ #include "file/writable_file_writer.h" #include "port/port.h" #include "rocksdb/convenience.h" +#include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/random.h" @@ -171,25 +172,6 @@ const Comparator* ComparatorWithU64Ts() { return &comp_with_u64_ts; } -WritableFileWriter* GetWritableFileWriter(WritableFile* wf, - const std::string& fname) { - std::unique_ptr file(wf); - return new WritableFileWriter(NewLegacyWritableFileWrapper(std::move(file)), - fname, EnvOptions()); -} - -RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf) { - std::unique_ptr file(raf); - return new RandomAccessFileReader(NewLegacyRandomAccessFileWrapper(file), - "[test RandomAccessFileReader]"); -} - -SequentialFileReader* GetSequentialFileReader(SequentialFile* se, - const std::string& fname) { - std::unique_ptr file(se); - return new SequentialFileReader(NewLegacySequentialFileWrapper(file), fname); -} - void CorruptKeyType(InternalKey* ikey) { std::string keystr = ikey->Encode().ToString(); keystr[keystr.size() - 8] = kTypeLogData; @@ -205,6 +187,38 @@ std::string KeyStr(const std::string& user_key, const SequenceNumber& seq, return k.Encode().ToString(); } +std::string KeyStr(uint64_t ts, const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt) { + std::string user_key_with_ts(user_key); + std::string ts_str; + PutFixed64(&ts_str, ts); + user_key_with_ts.append(ts_str); + return KeyStr(user_key_with_ts, seq, t, corrupt); +} + +bool SleepingBackgroundTask::TimedWaitUntilSleeping(uint64_t wait_time) { + auto abs_time = SystemClock::Default()->NowMicros() + wait_time; + MutexLock l(&mutex_); + while (!sleeping_ || !should_sleep_) { + if (bg_cv_.TimedWait(abs_time)) { + return true; + } + } + return false; +} + +bool SleepingBackgroundTask::TimedWaitUntilDone(uint64_t wait_time) { + auto abs_time = SystemClock::Default()->NowMicros() + wait_time; + MutexLock l(&mutex_); + while (!done_with_sleep_) { + if (bg_cv_.TimedWait(abs_time)) { + return true; + } + } + return false; +} + std::string RandomName(Random* rnd, const size_t len) { std::stringstream ss; for (size_t i = 0; i < len; ++i) { @@ -309,6 +323,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) { db_opt->error_if_exists = rnd->Uniform(2); db_opt->is_fd_close_on_exec = rnd->Uniform(2); db_opt->paranoid_checks = rnd->Uniform(2); + db_opt->track_and_verify_wals_in_manifest = rnd->Uniform(2); db_opt->skip_log_error_on_recovery = rnd->Uniform(2); db_opt->skip_stats_update_on_db_open = rnd->Uniform(2); db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2); @@ -370,12 +385,14 @@ void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options, cf_opt->compaction_options_fifo.allow_compaction = rnd->Uniform(2); cf_opt->memtable_whole_key_filtering = rnd->Uniform(2); cf_opt->enable_blob_files = rnd->Uniform(2); + cf_opt->enable_blob_garbage_collection = rnd->Uniform(2); // double options cf_opt->hard_rate_limit = static_cast(rnd->Uniform(10000)) / 13; cf_opt->soft_rate_limit = static_cast(rnd->Uniform(10000)) / 13; cf_opt->memtable_prefix_bloom_size_ratio = static_cast(rnd->Uniform(10000)) / 20000.0; 
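// A minimal usage sketch for the ROCKSDB_GTEST_SKIP / ROCKSDB_GTEST_BYPASS
// macros introduced in testharness.h above; per the comment there, they do
// not exit the test by themselves. The test name, fixture members, and the
// test::IsDirectIOSupported() gate are illustrative.
TEST_F(SomeEnvDependentTest, DirectIORead) {
  if (!test::IsDirectIOSupported(env_, test_dir_)) {
    ROCKSDB_GTEST_SKIP("Direct I/O is not supported in this environment");
    return;  // the macro records SKIPPED/SUCCESS but does not return
  }
  // ... actual test body ...
}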
+ cf_opt->blob_garbage_collection_age_cutoff = rnd->Uniform(10000) / 10000.0; // int options cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100); @@ -458,6 +475,26 @@ bool IsDirectIOSupported(Env* env, const std::string& dir) { return s.ok(); } +bool IsPrefetchSupported(const std::shared_ptr& fs, + const std::string& dir) { + bool supported = false; + std::string tmp = TempFileName(dir, 999); + Random rnd(301); + std::string test_string = rnd.RandomString(4096); + Slice data(test_string); + Status s = WriteStringToFile(fs.get(), data, tmp, true); + if (s.ok()) { + std::unique_ptr file; + auto io_s = fs->NewRandomAccessFile(tmp, FileOptions(), &file, nullptr); + if (io_s.ok()) { + supported = !(file->Prefetch(0, data.size(), IOOptions(), nullptr) + .IsNotSupported()); + } + s = fs->DeleteFile(tmp, IOOptions(), nullptr); + } + return s.ok() && supported; +} + size_t GetLinesCount(const std::string& fname, const std::string& pattern) { std::stringstream ssbuf; std::string line; @@ -475,46 +512,93 @@ size_t GetLinesCount(const std::string& fname, const std::string& pattern) { return count; } - -void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) { - struct stat sbuf; - if (stat(fname.c_str(), &sbuf) != 0) { - // strerror is not thread-safe so should not be used in the "passing" path - // of unit tests (sometimes parallelized) but is OK here where test fails - const char* msg = strerror(errno); - fprintf(stderr, "%s:%s\n", fname.c_str(), msg); - assert(false); - } - - if (offset < 0) { +Status CorruptFile(Env* env, const std::string& fname, int offset, + int bytes_to_corrupt, bool verify_checksum /*=true*/) { + uint64_t size; + Status s = env->GetFileSize(fname, &size); + if (!s.ok()) { + return s; + } else if (offset < 0) { // Relative to end of file; make it absolute - if (-offset > sbuf.st_size) { + if (-offset > static_cast(size)) { offset = 0; } else { - offset = static_cast(sbuf.st_size + offset); + offset = static_cast(size + offset); } } - if (offset > sbuf.st_size) { - offset = static_cast(sbuf.st_size); + if (offset > static_cast(size)) { + offset = static_cast(size); } - if (offset + bytes_to_corrupt > sbuf.st_size) { - bytes_to_corrupt = static_cast(sbuf.st_size - offset); + if (offset + bytes_to_corrupt > static_cast(size)) { + bytes_to_corrupt = static_cast(size - offset); } // Do it std::string contents; - Status s = ReadFileToString(Env::Default(), fname, &contents); - assert(s.ok()); - for (int i = 0; i < bytes_to_corrupt; i++) { - contents[i + offset] ^= 0x80; + s = ReadFileToString(env, fname, &contents); + if (s.ok()) { + for (int i = 0; i < bytes_to_corrupt; i++) { + contents[i + offset] ^= 0x80; + } + s = WriteStringToFile(env, contents, fname); } - s = WriteStringToFile(Env::Default(), contents, fname); - assert(s.ok()); - Options options; - EnvOptions env_options; + if (s.ok() && verify_checksum) { #ifndef ROCKSDB_LITE - assert(!VerifySstFileChecksum(options, env_options, fname).ok()); + Options options; + options.env = env; + EnvOptions env_options; + Status v = VerifySstFileChecksum(options, env_options, fname); + assert(!v.ok()); #endif + } + return s; +} + +Status TruncateFile(Env* env, const std::string& fname, uint64_t new_length) { + uint64_t old_length; + Status s = env->GetFileSize(fname, &old_length); + if (!s.ok() || new_length == old_length) { + return s; + } + // Do it + std::string contents; + s = ReadFileToString(env, fname, &contents); + if (s.ok()) { + contents.resize(static_cast(new_length), 'b'); + s = 
WriteStringToFile(env, contents, fname); + } + return s; +} + +// Try and delete a directory if it exists +Status TryDeleteDir(Env* env, const std::string& dirname) { + bool is_dir = false; + Status s = env->IsDirectory(dirname, &is_dir); + if (s.ok() && is_dir) { + s = env->DeleteDir(dirname); + } + return s; +} + +// Delete a directory if it exists +void DeleteDir(Env* env, const std::string& dirname) { + TryDeleteDir(env, dirname).PermitUncheckedError(); +} + +Status CreateEnvFromSystem(const ConfigOptions& config_options, Env** result, + std::shared_ptr* guard) { + const char* env_uri = getenv("TEST_ENV_URI"); + const char* fs_uri = getenv("TEST_FS_URI"); + if (env_uri || fs_uri) { + return Env::CreateFromUri(config_options, + (env_uri != nullptr) ? env_uri : "", + (fs_uri != nullptr) ? fs_uri : "", result, guard); + } else { + // Neither specified. Use the default + *result = config_options.env; + guard->reset(); + return Status::OK(); + } } } // namespace test diff --git a/test_util/testutil.h b/test_util/testutil.h index 4255a48f2d5..ae6d1dec4ef 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -22,12 +22,11 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/table.h" -#include "table/block_based/block_based_table_factory.h" #include "table/internal_iterator.h" -#include "table/plain/plain_table_factory.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { +class FileSystem; class Random; class SequentialFile; class SequentialFileReader; @@ -55,9 +54,10 @@ class ErrorEnv : public EnvWrapper { bool writable_file_error_; int num_writable_file_errors_; - ErrorEnv() : EnvWrapper(Env::Default()), - writable_file_error_(false), - num_writable_file_errors_(0) { } + ErrorEnv(Env* _target) + : EnvWrapper(_target), + writable_file_error_(false), + num_writable_file_errors_(0) {} virtual Status NewWritableFile(const std::string& fname, std::unique_ptr* result, @@ -179,23 +179,16 @@ class VectorIterator : public InternalIterator { std::vector values_; size_t current_; }; -extern WritableFileWriter* GetWritableFileWriter(WritableFile* wf, - const std::string& fname); -extern RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf); - -extern SequentialFileReader* GetSequentialFileReader(SequentialFile* se, - const std::string& fname); - -class StringSink: public WritableFile { +class StringSink : public FSWritableFile { public: std::string contents_; - explicit StringSink(Slice* reader_contents = nullptr) : - WritableFile(), - contents_(""), - reader_contents_(reader_contents), - last_flush_(0) { + explicit StringSink(Slice* reader_contents = nullptr) + : FSWritableFile(), + contents_(""), + reader_contents_(reader_contents), + last_flush_(0) { if (reader_contents_ != nullptr) { *reader_contents_ = Slice(contents_.data(), 0); } @@ -203,12 +196,15 @@ class StringSink: public WritableFile { const std::string& contents() const { return contents_; } - virtual Status Truncate(uint64_t size) override { + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.resize(static_cast(size)); - return Status::OK(); + return IOStatus::OK(); } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { if (reader_contents_ != nullptr) { assert(reader_contents_->size() <= 
last_flush_); size_t offset = last_flush_ - reader_contents_->size(); @@ -218,12 +214,17 @@ class StringSink: public WritableFile { last_flush_ = contents_.size(); } - return Status::OK(); + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.append(slice.data(), slice.size()); - return Status::OK(); + return IOStatus::OK(); } void Drop(size_t bytes) { if (reader_contents_ != nullptr) { @@ -240,36 +241,44 @@ class StringSink: public WritableFile { }; // A wrapper around a StringSink to give it a RandomRWFile interface -class RandomRWStringSink : public RandomRWFile { +class RandomRWStringSink : public FSRandomRWFile { public: explicit RandomRWStringSink(StringSink* ss) : ss_(ss) {} - Status Write(uint64_t offset, const Slice& data) override { + IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { if (offset + data.size() > ss_->contents_.size()) { ss_->contents_.resize(static_cast(offset) + data.size(), '\0'); } char* pos = const_cast(ss_->contents_.data() + offset); memcpy(pos, data.data(), data.size()); - return Status::OK(); + return IOStatus::OK(); } - Status Read(uint64_t offset, size_t n, Slice* result, - char* /*scratch*/) const override { + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*opts*/, + Slice* result, char* /*scratch*/, + IODebugContext* /*dbg*/) const override { *result = Slice(nullptr, 0); if (offset < ss_->contents_.size()) { size_t str_res_sz = std::min(static_cast(ss_->contents_.size() - offset), n); *result = Slice(ss_->contents_.data() + offset, str_res_sz); } - return Status::OK(); + return IOStatus::OK(); } - Status Flush() override { return Status::OK(); } + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Sync() override { return Status::OK(); } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } - Status Close() override { return Status::OK(); } + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } const std::string& contents() const { return ss_->contents(); } @@ -280,34 +289,42 @@ class RandomRWStringSink : public RandomRWFile { // Like StringSink, this writes into a string. Unlink StringSink, it // has some initial content and overwrites it, just like a recycled // log file. 
-class OverwritingStringSink : public WritableFile { +class OverwritingStringSink : public FSWritableFile { public: explicit OverwritingStringSink(Slice* reader_contents) - : WritableFile(), + : FSWritableFile(), contents_(""), reader_contents_(reader_contents), last_flush_(0) {} const std::string& contents() const { return contents_; } - virtual Status Truncate(uint64_t size) override { + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.resize(static_cast(size)); - return Status::OK(); + return IOStatus::OK(); } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { + IOStatus Close(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { if (last_flush_ < contents_.size()) { assert(reader_contents_->size() >= contents_.size()); memcpy((char*)reader_contents_->data() + last_flush_, contents_.data() + last_flush_, contents_.size() - last_flush_); last_flush_ = contents_.size(); } - return Status::OK(); + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { contents_.append(slice.data(), slice.size()); - return Status::OK(); + return IOStatus::OK(); } void Drop(size_t bytes) { contents_.resize(contents_.size() - bytes); @@ -320,7 +337,7 @@ class OverwritingStringSink : public WritableFile { size_t last_flush_; }; -class StringSource: public RandomAccessFile { +class StringSource : public FSRandomAccessFile { public: explicit StringSource(const Slice& contents, uint64_t uniq_id = 0, bool mmap = false) @@ -333,11 +350,23 @@ class StringSource: public RandomAccessFile { uint64_t Size() const { return contents_.size(); } - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { + IOStatus Prefetch(uint64_t /*offset*/, size_t /*n*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + // If we are using mmap_, it is equivalent to performing a prefetch + if (mmap_) { + return IOStatus::OK(); + } else { + return IOStatus::NotSupported("Prefetch not supported"); + } + } + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& /*opts*/, + Slice* result, char* scratch, + IODebugContext* /*dbg*/) const override { total_reads_++; if (offset > contents_.size()) { - return Status::InvalidArgument("invalid Read offset"); + return IOStatus::InvalidArgument("invalid Read offset"); } if (offset + n > contents_.size()) { n = contents_.size() - static_cast(offset); @@ -348,10 +377,10 @@ class StringSource: public RandomAccessFile { } else { *result = Slice(&contents_[static_cast(offset)], n); } - return Status::OK(); + return IOStatus::OK(); } - virtual size_t GetUniqueId(char* id, size_t max_size) const override { + size_t GetUniqueId(char* id, size_t max_size) const override { if (max_size < 20) { return 0; } @@ -373,13 +402,6 @@ class StringSource: public RandomAccessFile { mutable int total_reads_; }; -inline StringSink* GetStringSinkFromLegacyWriter( - const WritableFileWriter* writer) { - LegacyWritableFileWrapper* file = - static_cast(writer->writable_file()); - return static_cast(file->target()); -} - 
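// Illustration (not introduced by this diff): the conversion applied above to
// StringSink, RandomRWStringSink, OverwritingStringSink and StringSource is
// mechanical -- derive from the FS* interface instead of the legacy Env file
// class, return IOStatus instead of Status, and accept the extra IOOptions /
// IODebugContext parameters. A minimal, hypothetical in-memory sink that only
// counts bytes would look roughly like this (sketch only, not part of the
// test utilities):
//
//   class ByteCountingSink : public FSWritableFile {
//    public:
//     using FSWritableFile::Append;
//     IOStatus Append(const Slice& data, const IOOptions& /*opts*/,
//                     IODebugContext* /*dbg*/) override {
//       bytes_written_ += data.size();
//       return IOStatus::OK();
//     }
//     IOStatus Close(const IOOptions& /*opts*/,
//                    IODebugContext* /*dbg*/) override {
//       return IOStatus::OK();
//     }
//     IOStatus Flush(const IOOptions& /*opts*/,
//                    IODebugContext* /*dbg*/) override {
//       return IOStatus::OK();
//     }
//     IOStatus Sync(const IOOptions& /*opts*/,
//                   IODebugContext* /*dbg*/) override {
//       return IOStatus::OK();
//     }
//     uint64_t bytes_written() const { return bytes_written_; }
//
//    private:
//     uint64_t bytes_written_ = 0;
//   };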
class NullLogger : public Logger { public: using Logger::Logv; @@ -394,6 +416,10 @@ extern std::string KeyStr(const std::string& user_key, const SequenceNumber& seq, const ValueType& t, bool corrupt = false); +extern std::string KeyStr(uint64_t ts, const std::string& user_key, + const SequenceNumber& seq, const ValueType& t, + bool corrupt = false); + class SleepingBackgroundTask { public: SleepingBackgroundTask() @@ -427,16 +453,8 @@ class SleepingBackgroundTask { // otherwise times out. // wait_time is in microseconds. // Returns true when times out, false otherwise. - bool TimedWaitUntilSleeping(uint64_t wait_time) { - auto abs_time = Env::Default()->NowMicros() + wait_time; - MutexLock l(&mutex_); - while (!sleeping_ || !should_sleep_) { - if (bg_cv_.TimedWait(abs_time)) { - return true; - } - } - return false; - } + bool TimedWaitUntilSleeping(uint64_t wait_time); + void WakeUp() { MutexLock l(&mutex_); should_sleep_ = false; @@ -450,16 +468,8 @@ class SleepingBackgroundTask { } // Similar to TimedWaitUntilSleeping. // Waits until the task is done. - bool TimedWaitUntilDone(uint64_t wait_time) { - auto abs_time = Env::Default()->NowMicros() + wait_time; - MutexLock l(&mutex_); - while (!done_with_sleep_) { - if (bg_cv_.TimedWait(abs_time)) { - return true; - } - } - return false; - } + bool TimedWaitUntilDone(uint64_t wait_time); + bool WokenUp() { MutexLock l(&mutex_); return should_sleep_ == false; @@ -522,176 +532,220 @@ inline std::string EncodeInt(uint64_t x) { return result; } - class SeqStringSource : public SequentialFile { +class SeqStringSource : public FSSequentialFile { + public: + SeqStringSource(const std::string& data, std::atomic* read_count) + : data_(data), offset_(0), read_count_(read_count) {} + ~SeqStringSource() override {} + IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result, + char* scratch, IODebugContext* /*dbg*/) override { + std::string output; + if (offset_ < data_.size()) { + n = std::min(data_.size() - offset_, n); + memcpy(scratch, data_.data() + offset_, n); + offset_ += n; + *result = Slice(scratch, n); + } else { + return IOStatus::InvalidArgument( + "Attempt to read when it already reached eof."); + } + (*read_count_)++; + return IOStatus::OK(); + } + + IOStatus Skip(uint64_t n) override { + if (offset_ >= data_.size()) { + return IOStatus::InvalidArgument( + "Attempt to read when it already reached eof."); + } + // TODO(yhchiang): Currently doesn't handle the overflow case. + offset_ += static_cast(n); + return IOStatus::OK(); + } + + private: + std::string data_; + size_t offset_; + std::atomic* read_count_; +}; + +class StringFS : public FileSystemWrapper { + public: + class StringSink : public FSWritableFile { public: - SeqStringSource(const std::string& data, std::atomic* read_count) - : data_(data), offset_(0), read_count_(read_count) {} - ~SeqStringSource() override {} - Status Read(size_t n, Slice* result, char* scratch) override { - std::string output; - if (offset_ < data_.size()) { - n = std::min(data_.size() - offset_, n); - memcpy(scratch, data_.data() + offset_, n); - offset_ += n; - *result = Slice(scratch, n); - } else { - return Status::InvalidArgument( - "Attemp to read when it already reached eof."); - } - (*read_count_)++; - return Status::OK(); - } - Status Skip(uint64_t n) override { - if (offset_ >= data_.size()) { - return Status::InvalidArgument( - "Attemp to read when it already reached eof."); - } - // TODO(yhchiang): Currently doesn't handle the overflow case. 
- offset_ += static_cast(n); - return Status::OK(); + explicit StringSink(std::string* contents) + : FSWritableFile(), contents_(contents) {} + IOStatus Truncate(uint64_t size, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_->resize(static_cast(size)); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*opts*/, IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + using FSWritableFile::Append; + IOStatus Append(const Slice& slice, const IOOptions& /*opts*/, + IODebugContext* /*dbg*/) override { + contents_->append(slice.data(), slice.size()); + return IOStatus::OK(); } private: - std::string data_; - size_t offset_; - std::atomic* read_count_; + std::string* contents_; }; - class StringEnv : public EnvWrapper { - public: - class StringSink : public WritableFile { - public: - explicit StringSink(std::string* contents) - : WritableFile(), contents_(contents) {} - virtual Status Truncate(uint64_t size) override { - contents_->resize(static_cast(size)); - return Status::OK(); - } - virtual Status Close() override { return Status::OK(); } - virtual Status Flush() override { return Status::OK(); } - virtual Status Sync() override { return Status::OK(); } - virtual Status Append(const Slice& slice) override { - contents_->append(slice.data(), slice.size()); - return Status::OK(); - } - - private: - std::string* contents_; - }; - - explicit StringEnv(Env* t) : EnvWrapper(t) {} - ~StringEnv() override {} - - const std::string& GetContent(const std::string& f) { return files_[f]; } - - const Status WriteToNewFile(const std::string& file_name, + explicit StringFS(const std::shared_ptr& t) + : FileSystemWrapper(t) {} + ~StringFS() override {} + + const std::string& GetContent(const std::string& f) { return files_[f]; } + + const IOStatus WriteToNewFile(const std::string& file_name, const std::string& content) { - std::unique_ptr r; - auto s = NewWritableFile(file_name, &r, EnvOptions()); - if (s.ok()) { - s = r->Append(content); - } - if (s.ok()) { - s = r->Flush(); - } - if (s.ok()) { - s = r->Close(); - } - assert(!s.ok() || files_[file_name] == content); - return s; - } - - // The following text is boilerplate that forwards all methods to target() - Status NewSequentialFile(const std::string& f, - std::unique_ptr* r, - const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist", f); - } - r->reset(new SeqStringSource(iter->second, &num_seq_file_read_)); - return Status::OK(); - } - Status NewRandomAccessFile(const std::string& /*f*/, - std::unique_ptr* /*r*/, - const EnvOptions& /*options*/) override { - return Status::NotSupported(); - } - Status NewWritableFile(const std::string& f, - std::unique_ptr* r, - const EnvOptions& /*options*/) override { - auto iter = files_.find(f); - if (iter != files_.end()) { - return Status::IOError("The specified file already exists", f); - } - r->reset(new StringSink(&files_[f])); - return Status::OK(); - } - virtual Status NewDirectory( - const std::string& /*name*/, - std::unique_ptr* /*result*/) override { - return Status::NotSupported(); - } - Status FileExists(const std::string& f) override { - if (files_.find(f) == files_.end()) { - return Status::NotFound(); - } - return 
Status::OK(); - } - Status GetChildren(const std::string& /*dir*/, - std::vector* /*r*/) override { - return Status::NotSupported(); - } - Status DeleteFile(const std::string& f) override { - files_.erase(f); - return Status::OK(); - } - Status CreateDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status CreateDirIfMissing(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status DeleteDir(const std::string& /*d*/) override { - return Status::NotSupported(); - } - Status GetFileSize(const std::string& f, uint64_t* s) override { - auto iter = files_.find(f); - if (iter == files_.end()) { - return Status::NotFound("The specified file does not exist:", f); - } - *s = iter->second.size(); - return Status::OK(); - } - - Status GetFileModificationTime(const std::string& /*fname*/, - uint64_t* /*file_mtime*/) override { - return Status::NotSupported(); - } - - Status RenameFile(const std::string& /*s*/, - const std::string& /*t*/) override { - return Status::NotSupported(); - } - - Status LinkFile(const std::string& /*s*/, - const std::string& /*t*/) override { - return Status::NotSupported(); - } - - Status LockFile(const std::string& /*f*/, FileLock** /*l*/) override { - return Status::NotSupported(); - } - - Status UnlockFile(FileLock* /*l*/) override { - return Status::NotSupported(); - } - - std::atomic num_seq_file_read_; + std::unique_ptr r; + FileOptions file_opts; + IOOptions io_opts; - protected: - std::unordered_map files_; - }; + auto s = NewWritableFile(file_name, file_opts, &r, nullptr); + if (s.ok()) { + s = r->Append(content, io_opts, nullptr); + } + if (s.ok()) { + s = r->Flush(io_opts, nullptr); + } + if (s.ok()) { + s = r->Close(io_opts, nullptr); + } + assert(!s.ok() || files_[file_name] == content); + return s; + } + + // The following text is boilerplate that forwards all methods to target() + IOStatus NewSequentialFile(const std::string& f, + const FileOptions& /*options*/, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return IOStatus::NotFound("The specified file does not exist", f); + } + r->reset(new SeqStringSource(iter->second, &num_seq_file_read_)); + return IOStatus::OK(); + } + + IOStatus NewRandomAccessFile(const std::string& /*f*/, + const FileOptions& /*options*/, + std::unique_ptr* /*r*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus NewWritableFile(const std::string& f, const FileOptions& /*options*/, + std::unique_ptr* r, + IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter != files_.end()) { + return IOStatus::IOError("The specified file already exists", f); + } + r->reset(new StringSink(&files_[f])); + return IOStatus::OK(); + } + IOStatus NewDirectory(const std::string& /*name*/, + const IOOptions& /*options*/, + std::unique_ptr* /*result*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus FileExists(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + if (files_.find(f) == files_.end()) { + return IOStatus::NotFound(); + } + return IOStatus::OK(); + } + + IOStatus GetChildren(const std::string& /*dir*/, const IOOptions& /*options*/, + std::vector* /*r*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus DeleteFile(const std::string& f, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + files_.erase(f); + return 
IOStatus::OK(); + } + + IOStatus CreateDir(const std::string& /*d*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus CreateDirIfMissing(const std::string& /*d*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus DeleteDir(const std::string& /*d*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus GetFileSize(const std::string& f, const IOOptions& /*options*/, + uint64_t* s, IODebugContext* /*dbg*/) override { + auto iter = files_.find(f); + if (iter == files_.end()) { + return IOStatus::NotFound("The specified file does not exist:", f); + } + *s = iter->second.size(); + return IOStatus::OK(); + } + + IOStatus GetFileModificationTime(const std::string& /*fname*/, + const IOOptions& /*options*/, + uint64_t* /*file_mtime*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus RenameFile(const std::string& /*s*/, const std::string& /*t*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus LinkFile(const std::string& /*s*/, const std::string& /*t*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus LockFile(const std::string& /*f*/, const IOOptions& /*options*/, + FileLock** /*l*/, IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + IOStatus UnlockFile(FileLock* /*l*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::NotSupported(); + } + + std::atomic num_seq_file_read_; + + protected: + std::unordered_map files_; +}; // Randomly initialize the given DBOptions void RandomInitDBOptions(DBOptions* db_opt, Random* rnd); @@ -792,6 +846,9 @@ std::string RandomName(Random* rnd, const size_t len); bool IsDirectIOSupported(Env* env, const std::string& dir); +bool IsPrefetchSupported(const std::shared_ptr& fs, + const std::string& dir); + // Return the number of lines where a given pattern was found in a file. size_t GetLinesCount(const std::string& fname, const std::string& pattern); @@ -800,8 +857,19 @@ size_t GetLinesCount(const std::string& fname, const std::string& pattern); // Tries to set TEST_TMPDIR to a directory supporting direct IO. void ResetTmpDirForDirectIO(); +Status CorruptFile(Env* env, const std::string& fname, int offset, + int bytes_to_corrupt, bool verify_checksum = true); +Status TruncateFile(Env* env, const std::string& fname, uint64_t length); + +// Try and delete a directory if it exists +Status TryDeleteDir(Env* env, const std::string& dirname); -void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt); +// Delete a directory if it exists +void DeleteDir(Env* env, const std::string& dirname); +// Creates an Env from the system environment by looking at the system +// environment variables. 
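// The implementation (see testutil.cc in this diff) honors the TEST_ENV_URI
// and TEST_FS_URI environment variables and otherwise falls back to
// config_options.env. A typical call from a test, sketched here for
// illustration only (the surrounding setup is hypothetical), might be:
//
//   Env* env = Env::Default();
//   std::shared_ptr<Env> env_guard;
//   Status s = test::CreateEnvFromSystem(ConfigOptions(), &env, &env_guard);
//   // On success `env` is ready to use; `env_guard` owns it only when a
//   // custom Env/FileSystem was created from one of the URIs.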
+Status CreateEnvFromSystem(const ConfigOptions& options, Env** result, + std::shared_ptr* guard); } // namespace test } // namespace ROCKSDB_NAMESPACE diff --git a/test_util/transaction_test_util.cc b/test_util/transaction_test_util.cc index a9410f5fc47..28f16a5e71b 100644 --- a/test_util/transaction_test_util.cc +++ b/test_util/transaction_test_util.cc @@ -349,6 +349,7 @@ Status RandomTransactionInserter::Verify(DB* db, uint16_t num_sets, static_cast(key.size()), key.data(), int_value); total += int_value; } + iter->status().PermitUncheckedError(); delete iter; } diff --git a/third-party/folly/folly/synchronization/DistributedMutex-inl.h b/third-party/folly/folly/synchronization/DistributedMutex-inl.h index ca5650de696..6e250c94d25 100644 --- a/third-party/folly/folly/synchronization/DistributedMutex-inl.h +++ b/third-party/folly/folly/synchronization/DistributedMutex-inl.h @@ -1374,7 +1374,8 @@ inline std::uintptr_t tryWake( // we need release here because of the write to waker_ and also because we // are unlocking the mutex, the thread we do the handoff to here should // see the modified data - new (&waiter->metadata_) Metadata(waker, bit_cast(sleepers)); + new (&waiter->metadata_) + Metadata(waker, folly::bit_cast(sleepers)); waiter->futex_.store(kWake, std::memory_order_release); return 0; } diff --git a/third-party/gcc/ppc-asm.h b/third-party/gcc/ppc-asm.h new file mode 100644 index 00000000000..e0bce9c5aec --- /dev/null +++ b/third-party/gcc/ppc-asm.h @@ -0,0 +1,390 @@ +/* PowerPC asm definitions for GNU C. + +Copyright (C) 2002-2020 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +. 
*/ + +/* Under winnt, 1) gas supports the following as names and 2) in particular + defining "toc" breaks the FUNC_START macro as ".toc" becomes ".2" */ + +#define r0 0 +#define sp 1 +#define toc 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + +#define cr0 0 +#define cr1 1 +#define cr2 2 +#define cr3 3 +#define cr4 4 +#define cr5 5 +#define cr6 6 +#define cr7 7 + +#define f0 0 +#define f1 1 +#define f2 2 +#define f3 3 +#define f4 4 +#define f5 5 +#define f6 6 +#define f7 7 +#define f8 8 +#define f9 9 +#define f10 10 +#define f11 11 +#define f12 12 +#define f13 13 +#define f14 14 +#define f15 15 +#define f16 16 +#define f17 17 +#define f18 18 +#define f19 19 +#define f20 20 +#define f21 21 +#define f22 22 +#define f23 23 +#define f24 24 +#define f25 25 +#define f26 26 +#define f27 27 +#define f28 28 +#define f29 29 +#define f30 30 +#define f31 31 + +#ifdef __VSX__ +#define f32 32 +#define f33 33 +#define f34 34 +#define f35 35 +#define f36 36 +#define f37 37 +#define f38 38 +#define f39 39 +#define f40 40 +#define f41 41 +#define f42 42 +#define f43 43 +#define f44 44 +#define f45 45 +#define f46 46 +#define f47 47 +#define f48 48 +#define f49 49 +#define f50 50 +#define f51 51 +#define f52 52 +#define f53 53 +#define f54 54 +#define f55 55 +#define f56 56 +#define f57 57 +#define f58 58 +#define f59 59 +#define f60 60 +#define f61 61 +#define f62 62 +#define f63 63 +#endif + +#ifdef __ALTIVEC__ +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 +#endif + +#ifdef __VSX__ +#define vs0 0 +#define vs1 1 +#define vs2 2 +#define vs3 3 +#define vs4 4 +#define vs5 5 +#define vs6 6 +#define vs7 7 +#define vs8 8 +#define vs9 9 +#define vs10 10 +#define vs11 11 +#define vs12 12 +#define vs13 13 +#define vs14 14 +#define vs15 15 +#define vs16 16 +#define vs17 17 +#define vs18 18 +#define vs19 19 +#define vs20 20 +#define vs21 21 +#define vs22 22 +#define vs23 23 +#define vs24 24 +#define vs25 25 +#define vs26 26 +#define vs27 27 +#define vs28 28 +#define vs29 29 +#define vs30 30 +#define vs31 31 +#define vs32 32 +#define vs33 33 +#define vs34 34 +#define vs35 35 +#define vs36 36 +#define vs37 37 +#define vs38 38 +#define vs39 39 +#define vs40 40 +#define vs41 41 +#define vs42 42 +#define vs43 43 +#define vs44 44 +#define vs45 45 +#define vs46 46 +#define vs47 47 +#define vs48 48 +#define vs49 49 +#define vs50 50 +#define vs51 51 +#define vs52 52 +#define vs53 53 +#define vs54 54 +#define vs55 55 +#define vs56 56 +#define vs57 57 +#define vs58 58 +#define vs59 59 +#define vs60 60 +#define vs61 61 +#define vs62 62 +#define vs63 63 +#endif + +/* + * Macros to glue together two tokens. 
+ */ + +#ifdef __STDC__ +#define XGLUE(a,b) a##b +#else +#define XGLUE(a,b) a/**/b +#endif + +#define GLUE(a,b) XGLUE(a,b) + +/* + * Macros to begin and end a function written in assembler. If -mcall-aixdesc + * or -mcall-nt, create a function descriptor with the given name, and create + * the real function with one or two leading periods respectively. + */ + +#if defined(__powerpc64__) && _CALL_ELF == 2 + +/* Defining "toc" above breaks @toc in assembler code. */ +#undef toc + +#define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name) +#ifdef __PCREL__ +#define JUMP_TARGET(name) GLUE(FUNC_NAME(name),@notoc) +#define FUNC_START(name) \ + .type FUNC_NAME(name),@function; \ + .globl FUNC_NAME(name); \ +FUNC_NAME(name): \ + .localentry FUNC_NAME(name),1 +#else +#define JUMP_TARGET(name) FUNC_NAME(name) +#define FUNC_START(name) \ + .type FUNC_NAME(name),@function; \ + .globl FUNC_NAME(name); \ +FUNC_NAME(name): \ +0: addis 2,12,(.TOC.-0b)@ha; \ + addi 2,2,(.TOC.-0b)@l; \ + .localentry FUNC_NAME(name),.-FUNC_NAME(name) +#endif /* !__PCREL__ */ + +#define HIDDEN_FUNC(name) \ + FUNC_START(name) \ + .hidden FUNC_NAME(name); + +#define FUNC_END(name) \ + .size FUNC_NAME(name),.-FUNC_NAME(name) + +#elif defined (__powerpc64__) + +#define FUNC_NAME(name) GLUE(.,name) +#define JUMP_TARGET(name) FUNC_NAME(name) +#define FUNC_START(name) \ + .section ".opd","aw"; \ +name: \ + .quad GLUE(.,name); \ + .quad .TOC.@tocbase; \ + .quad 0; \ + .previous; \ + .type GLUE(.,name),@function; \ + .globl name; \ + .globl GLUE(.,name); \ +GLUE(.,name): + +#define HIDDEN_FUNC(name) \ + FUNC_START(name) \ + .hidden name; \ + .hidden GLUE(.,name); + +#define FUNC_END(name) \ +GLUE(.L,name): \ + .size GLUE(.,name),GLUE(.L,name)-GLUE(.,name) + +#elif defined(_CALL_AIXDESC) + +#ifdef _RELOCATABLE +#define DESC_SECTION ".got2" +#else +#define DESC_SECTION ".got1" +#endif + +#define FUNC_NAME(name) GLUE(.,name) +#define JUMP_TARGET(name) FUNC_NAME(name) +#define FUNC_START(name) \ + .section DESC_SECTION,"aw"; \ +name: \ + .long GLUE(.,name); \ + .long _GLOBAL_OFFSET_TABLE_; \ + .long 0; \ + .previous; \ + .type GLUE(.,name),@function; \ + .globl name; \ + .globl GLUE(.,name); \ +GLUE(.,name): + +#define HIDDEN_FUNC(name) \ + FUNC_START(name) \ + .hidden name; \ + .hidden GLUE(.,name); + +#define FUNC_END(name) \ +GLUE(.L,name): \ + .size GLUE(.,name),GLUE(.L,name)-GLUE(.,name) + +#else + +#define FUNC_NAME(name) GLUE(__USER_LABEL_PREFIX__,name) +#if defined __PIC__ || defined __pic__ +#define JUMP_TARGET(name) FUNC_NAME(name@plt) +#else +#define JUMP_TARGET(name) FUNC_NAME(name) +#endif +#define FUNC_START(name) \ + .type FUNC_NAME(name),@function; \ + .globl FUNC_NAME(name); \ +FUNC_NAME(name): + +#define HIDDEN_FUNC(name) \ + FUNC_START(name) \ + .hidden FUNC_NAME(name); + +#define FUNC_END(name) \ +GLUE(.L,name): \ + .size FUNC_NAME(name),GLUE(.L,name)-FUNC_NAME(name) +#endif + +#ifdef IN_GCC +/* For HAVE_GAS_CFI_DIRECTIVE. 
*/ +#include "auto-host.h" + +#ifdef HAVE_GAS_CFI_DIRECTIVE +# define CFI_STARTPROC .cfi_startproc +# define CFI_ENDPROC .cfi_endproc +# define CFI_OFFSET(reg, off) .cfi_offset reg, off +# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg +# define CFI_RESTORE(reg) .cfi_restore reg +#else +# define CFI_STARTPROC +# define CFI_ENDPROC +# define CFI_OFFSET(reg, off) +# define CFI_DEF_CFA_REGISTER(reg) +# define CFI_RESTORE(reg) +#endif +#endif + +#if defined __linux__ && !defined __powerpc64__ + .section .note.GNU-stack + .previous +#endif diff --git a/tools/backup_db.sh b/tools/backup_db.sh new file mode 100755 index 00000000000..aa82f1dbaed --- /dev/null +++ b/tools/backup_db.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# +# + +if [ "$#" -lt 2 ]; then + echo "usage: ${BASH_SOURCE[0]} " + exit 1 +fi + +db_dir="$1" +backup_dir="$2" + +echo "== Backing up DB $db_dir to $backup_dir" +./ldb backup --db="$db_dir" --backup_dir="$backup_dir" diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc index bffd480b435..29ec8cb91ba 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc @@ -19,6 +19,7 @@ #include #include "monitoring/histogram.h" +#include "rocksdb/system_clock.h" #include "util/gflags_compat.h" #include "util/string_util.h" @@ -1519,6 +1520,7 @@ Status BlockCacheTraceAnalyzer::RecordAccess( } Status BlockCacheTraceAnalyzer::Analyze() { + SystemClock* clock = env_->GetSystemClock().get(); std::unique_ptr reader; Status s = Status::OK(); if (is_human_readable_trace_file_) { @@ -1542,7 +1544,7 @@ Status BlockCacheTraceAnalyzer::Analyze() { return s; } } - uint64_t start = env_->NowMicros(); + uint64_t start = clock->NowMicros(); uint64_t time_interval = 0; while (s.ok()) { BlockCacheTraceRecord access; @@ -1568,7 +1570,7 @@ Status BlockCacheTraceAnalyzer::Analyze() { cache_simulator_->Access(access); } access_sequence_number_++; - uint64_t now = env_->NowMicros(); + uint64_t now = clock->NowMicros(); uint64_t duration = (now - start) / kMicrosInSecond; if (duration > 10 * time_interval) { uint64_t trace_duration = @@ -1582,7 +1584,7 @@ Status BlockCacheTraceAnalyzer::Analyze() { time_interval++; } } - uint64_t now = env_->NowMicros(); + uint64_t now = clock->NowMicros(); uint64_t duration = (now - start) / kMicrosInSecond; uint64_t trace_duration = trace_end_timestamp_in_seconds_ - trace_start_timestamp_in_seconds_; diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.h b/tools/block_cache_analyzer/block_cache_trace_analyzer.h index 74fc22b10fd..4436e0b77a1 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer.h +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.h @@ -103,7 +103,8 @@ struct BlockAccessInfo { num_referenced_key_exist_in_block++; if (referenced_data_size > block_size && block_size != 0) { ParsedInternalKey internal_key; - Status s = ParseInternalKey(access.referenced_key, &internal_key); + Status s = ParseInternalKey(access.referenced_key, &internal_key, + false /* log_err_key */); // TODO assert(s.ok()); // TODO } } else { diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc index c26795d60ad..91bd30652f7 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +++ 
b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc @@ -9,7 +9,7 @@ int main() { fprintf(stderr, "Please install gflags to run block_cache_trace_analyzer_test\n"); - return 1; + return 0; } #else @@ -44,7 +44,7 @@ const size_t kArgBufferSize = 100000; class BlockCacheTracerTest : public testing::Test { public: BlockCacheTracerTest() { - test_path_ = test::PerThreadDBPath("block_cache_tracer_test"); + test_path_ = test::PerThreadDBPath("block_cache_trace_analyzer_test"); env_ = ROCKSDB_NAMESPACE::Env::Default(); EXPECT_OK(env_->CreateDir(test_path_)); trace_file_path_ = test_path_ + "/block_cache_trace"; @@ -225,7 +225,9 @@ TEST_F(BlockCacheTracerTest, BlockCacheAnalyzer) { std::unique_ptr trace_writer; ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, &trace_writer)); - BlockCacheTraceWriter writer(env_, trace_opt, std::move(trace_writer)); + const auto& clock = env_->GetSystemClock(); + BlockCacheTraceWriter writer(clock.get(), trace_opt, + std::move(trace_writer)); ASSERT_OK(writer.WriteHeader()); WriteBlockAccess(&writer, 0, TraceType::kBlockTraceDataBlock, 50); ASSERT_OK(env_->FileExists(trace_file_path_)); @@ -610,9 +612,11 @@ TEST_F(BlockCacheTracerTest, MixedBlocks) { // kSSTStoringEvenKeys. TraceOptions trace_opt; std::unique_ptr trace_writer; + const auto& clock = env_->GetSystemClock(); ASSERT_OK(NewFileTraceWriter(env_, env_options_, trace_file_path_, &trace_writer)); - BlockCacheTraceWriter writer(env_, trace_opt, std::move(trace_writer)); + BlockCacheTraceWriter writer(clock.get(), trace_opt, + std::move(trace_writer)); ASSERT_OK(writer.WriteHeader()); // Write blocks of different types. WriteBlockAccess(&writer, 0, TraceType::kBlockTraceUncompressionDictBlock, diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index e9424196d5c..5bba01daefa 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -1,24 +1,82 @@ #!/usr/bin/env bash # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # -# A shell script to load some pre generated data file to a DB using ldb tool -# ./ldb needs to be avaible to be executed. +# A shell script to build and run different versions of ldb to check for +# expected forward and backward compatibility with "current" version. The +# working copy must have no uncommitted changes. +# +# Usage: